├── .github └── workflows │ ├── bump-version.yml │ ├── ci.yml │ └── release.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── MANIFEST.in ├── NOTICES.txt ├── README.md ├── README_dev.md ├── access └── policy_template.json ├── builtin ├── __init__.py └── ingest.py ├── docs ├── access-management.md ├── architecture.md ├── data-product-processor-arch.png ├── data-product-specification.md └── how-to │ ├── custom-dependencies.md │ ├── local-development.md │ └── transformation-logic.md ├── driver ├── __init__.py ├── aws │ ├── __init__.py │ ├── datalake_api.py │ ├── glue_api.py │ ├── providers.py │ └── resolvers.py ├── common.py ├── core.py ├── driver.py ├── io_handlers.py ├── packager.py ├── processors.py ├── schema │ └── 1.rc-1 │ │ ├── model.json │ │ └── product.json ├── task_executor.py └── util.py ├── main.py ├── package.py ├── pytest.ini ├── requirements-test.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── assets │ ├── aws_api_rsps │ │ ├── aws_glue_dc_connection.json │ │ ├── glue_dc_get_db_rsp.json │ │ ├── glue_dc_get_db_rsps.json │ │ └── glue_gc_get_table_rsp.json │ ├── integration │ │ ├── model.yml │ │ ├── product.yml │ │ └── tasks │ │ │ ├── __init__.py │ │ │ └── custom_business_logic.py │ ├── integration_calendar │ │ ├── model.yml │ │ └── product.yml │ ├── integration_file │ │ ├── model.yml │ │ └── product.yml │ ├── integration_sport_events │ │ ├── __init__.py │ │ ├── model.yml │ │ ├── product.yml │ │ └── tasks │ │ │ ├── __init__.py │ │ │ └── custom_aggregate_events.py │ └── metafiles │ │ ├── model.yml │ │ ├── model_compilation.yml │ │ ├── model_correct.yml │ │ ├── model_remove_xtra_columns.yml │ │ ├── model_strict_validation.yml │ │ ├── product.yml │ │ ├── product_compilation.yml │ │ ├── product_correct.yml │ │ ├── product_correct_all_models.yml │ │ ├── product_correct_connection_w_model.yml │ │ ├── product_correct_missing_logic_params.yml │ │ ├── product_input_file.yml │ │ ├── product_missing_logic.yml │ │ ├── product_wrong_engine.yml │ │ └── product_wrong_output.yml ├── aws │ ├── __init__.py │ └── test_datalake.py ├── catalog │ ├── __init__.py │ └── test_catalog.py ├── conftest.py ├── test_constraint_checkers.py ├── test_core.py ├── test_df_schema_validator.py ├── test_model_compilation.py ├── test_task_executor.py └── test_util.py └── version.sh /.github/workflows/bump-version.yml: -------------------------------------------------------------------------------- 1 | name: bump-version 2 | 3 | on: 4 | push: 5 | tags: 6 | - major-* 7 | - minor-* 8 | - patch-* 9 | 10 | permissions: 11 | contents: write 12 | pull-requests: write 13 | 14 | jobs: 15 | bump-version: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | fetch-depth: 0 21 | ref: ${{ github.ref }} 22 | - name: set to major release 23 | if: github.ref_type == 'tag' && startsWith(github.ref_name, 'major-') 24 | run: | 25 | echo "RELEASE_TYPE=major" >> $GITHUB_ENV 26 | - name: set to minor release 27 | if: github.ref_type == 'tag' && startsWith(github.ref_name, 'minor-') 28 | run: | 29 | echo "RELEASE_TYPE=minor" >> $GITHUB_ENV 30 | - name: set to patch release 31 | if: github.ref_type == 'tag' && startsWith(github.ref_name, 'patch-') 32 | run: | 33 | echo "RELEASE_TYPE=patch" >> $GITHUB_ENV 34 | - id: bump2version 35 | name: bump data product processor version 36 | run: | 37 | git config --global user.email "CI" 38 | git config --global user.name "CI@users.noreply.github.com" 39 | 40 | pip install bump2version 41 | . 
./version.sh 42 | echo $VERSION 43 | bump2version --current-version $VERSION --tag-name '{new_version}' --tag ${RELEASE_TYPE} 44 | # load new version in environment 45 | . ./version.sh 46 | echo $VERSION 47 | echo "VERSION=$VERSION" >> $GITHUB_ENV 48 | # push tag 49 | git push origin refs/tags/${VERSION} 50 | - name: create pull request 51 | uses: peter-evans/create-pull-request@v4 52 | with: 53 | title: "release version: ${{ env.VERSION }}" 54 | branch: release-${{ env.VERSION }} 55 | delete-branch: true 56 | base: main 57 | labels: | 58 | release 59 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 0 16 | - name: unit tests 17 | run: | 18 | pip install --upgrade pip 19 | pip install -U -e . 20 | pip install -r requirements-test.txt 21 | pytest --cov=deprecated -s -m 'not integration' 22 | - id: bump2version 23 | name: bump data product processor version to pre-release 24 | run: | 25 | pip install bump2version 26 | . ./version.sh 27 | echo "latest release: ${VERSION}" 28 | head=$(git rev-parse --short HEAD) 29 | commits_since_tag=$(git rev-list ${VERSION}..HEAD --count) 30 | echo "new snapshot: ${VERSION}+${commits_since_tag}.${head}" 31 | bump2version --new-version ${VERSION}+${commits_since_tag}.${head} pre 32 | - name: build wheel 33 | run: | 34 | python setup.py build -vf && python setup.py bdist_wheel 35 | - name: archive dist 36 | uses: actions/upload-artifact@v3 37 | with: 38 | name: dpp-dist 39 | retention-days: 30 40 | path: | 41 | ./dist/* 42 | ./main.py 43 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | outputs: 15 | version: ${{ steps.getversion.outputs.VERSION }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | - id: getversion 19 | name: get version 20 | run: | 21 | . ./version.sh 22 | echo "VERSION=$VERSION" >> $GITHUB_ENV 23 | echo "VERSION=$VERSION" >> $GITHUB_OUTPUT 24 | - name: build wheel 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -U -e . 28 | pip install -r requirements-test.txt 29 | python setup.py build -vf && python setup.py bdist_wheel 30 | - name: archive artifacts 31 | uses: actions/upload-artifact@v3 32 | with: 33 | name: data-product-processor-${{ env.VERSION }} 34 | retention-days: 30 35 | path: | 36 | ./dist/* 37 | ./main.py 38 | publish-test: 39 | needs: build 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/download-artifact@v3 43 | with: 44 | name: data-product-processor-${{ needs.build.outputs.version }} 45 | - name: publish 46 | uses: pypa/gh-action-pypi-publish@release/v1 47 | with: 48 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 49 | repository_url: https://test.pypi.org/legacy/ 50 | print_hash: true 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | .DS_Store/ 7 | .DS_Store 8 | *.py[cod] 9 | *$py.class 10 | pyrightconfig.json 11 | # C extensions 12 | *.so 13 | .vscode 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | spark_deps/ 118 | spark_deps 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | #IDE 139 | .idea/ 140 | */glue-libs/ 141 | 142 | cdk/functions/config_validation/env 143 | cdk/functions/config_validation/package 144 | cdk/functions/config_validation/package.sh 145 | cdk/functions/config_validation/my-deployment-package.zip 146 | 147 | # Logs 148 | logs 149 | *.log 150 | npm-debug.log* 151 | yarn-debug.log* 152 | yarn-error.log* 153 | lerna-debug.log* 154 | .pnpm-debug.log* 155 | 156 | # Diagnostic reports (https://nodejs.org/api/report.html) 157 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 158 | 159 | # Runtime data 160 | pids 161 | *.pid 162 | *.seed 163 | *.pid.lock 164 | 165 | # Dependency directories 166 | node_modules/ 167 | jspm_packages/ 168 | 169 | # Optional npm cache directory 170 | .npm 171 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. 
You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include driver/schema/*/*.json -------------------------------------------------------------------------------- /NOTICES.txt: -------------------------------------------------------------------------------- 1 | dpac-data-product-processor 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | 4 | ********************** 5 | THIRD PARTY COMPONENTS 6 | ********************** 7 | 8 | Package: attrs 9 | License: MIT 10 | Requires: n/a 11 | Author: Hynek Schlawack 12 | Home page: https://www.attrs.org/ 13 | 14 | ---------------------------------------- 15 | 16 | Package: boto3 17 | License: Apache-2.0 18 | Requires: botocore, jmespath, s3transfer 19 | Author: Amazon Web Services 20 | Home page: https://github.com/boto/boto3 21 | 22 | ---------------------------------------- 23 | 24 | Package: botocore 25 | License: Apache-2.0 26 | Requires: jmespath, python-dateutil, urllib3 27 | Author: Amazon Web Services 28 | Home page: https://github.com/boto/botocore 29 | 30 | ---------------------------------------- 31 | 32 | Package: jmespath 33 | License: MIT 34 | Requires: n/a 35 | Author: James Saryerwinnie 36 | Home page: https://github.com/jmespath/jmespath.py 37 | 38 | ---------------------------------------- 39 | 40 | Package: jsonschema 41 | License: MIT 42 | Requires: attrs, pyrsistent, setuptools, six 43 | Author: Julian Berman 44 | Home page: https://github.com/Julian/jsonschema 45 | 46 | ---------------------------------------- 47 | 48 | Package: mypy-boto3-glue 49 | License: MIT 50 | Requires: n/a 51 | Author: Vlad Emelianov 52 | Home page: https://github.com/vemel/mypy_boto3_builder 53 | 54 | ---------------------------------------- 55 | 56 | Package: pydantic 57 | License: MIT 58 | Requires: typing-extensions 59 | Author: Samuel Colvin 60 | Home page: https://github.com/samuelcolvin/pydantic 61 | 62 | The MIT License (MIT) 63 | 64 | ---------------------------------------- 65 | 66 | Package: pyrsistent 67 | License: MIT 68 | Requires: n/a 69 | Author: Tobias Gustafsson 70 | Home page: http://github.com/tobgu/pyrsistent/ 71 | 72 | ---------------------------------------- 73 | 74 | Package: python-dateutil 75 | License: Apache-2.0 76 | Requires: six 77 | Author: Gustavo Niemeyer 78 | Home page: https://github.com/dateutil/dateutil 79 | 80 | -------------------------------------------------------------------------------- 81 | dateutil - Extensions to the standard Python datetime module. 82 | 83 | Copyright (c) 2003-2011 - Gustavo Niemeyer 84 | Copyright (c) 2012-2014 - Tomi Pieviläinen 85 | Copyright (c) 2014-2016 - Yaron de Leeuw 86 | Copyright (c) 2015- - Paul Ganssle 87 | Copyright (c) 2015- - dateutil contributors (see AUTHORS file) 88 | 89 | All rights reserved. 90 | 91 | Redistribution and use in source and binary forms, with or without 92 | modification, are permitted provided that the following conditions are met: 93 | 94 | * Redistributions of source code must retain the above copyright notice, 95 | this list of conditions and the following disclaimer. 96 | * Redistributions in binary form must reproduce the above copyright notice, 97 | this list of conditions and the following disclaimer in the documentation 98 | and/or other materials provided with the distribution. 99 | * Neither the name of the copyright holder nor the names of its 100 | contributors may be used to endorse or promote products derived from 101 | this software without specific prior written permission. 102 | 103 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 104 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 105 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 106 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 107 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 108 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 109 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 110 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 111 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 112 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 113 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 114 | 115 | The above BSD License Applies to all code, even that also covered by Apache 2.0. 116 | 117 | ---------------------------------------- 118 | 119 | Package: quinn 120 | License: Apache-2.0 121 | Requires: n/a 122 | Author: MrPowers 123 | Home page: https://github.com/MrPowers/quinn/ 124 | 125 | ---------------------------------------- 126 | 127 | Package: s3transfer 128 | License: Apache-2.0 129 | Requires: botocore 130 | Author: Amazon Web Services 131 | Home page: https://github.com/boto/s3transfer 132 | 133 | ---------------------------------------- 134 | 135 | Package: setuptools 136 | License: MIT 137 | Requires: n/a 138 | Author: Python Packaging Authority 139 | Home page: https://github.com/pypa/setuptools 140 | 141 | ---------------------------------------- 142 | 143 | Package: six 144 | License: MIT 145 | Requires: n/a 146 | Author: Benjamin Peterson 147 | Home page: https://github.com/benjaminp/six 148 | 149 | ---------------------------------------- 150 | 151 | Package: urllib3 152 | License: MIT 153 | Requires: n/a 154 | Author: Andrey Petrov 155 | Home page: https://urllib3.readthedocs.io/ 156 | 157 | MIT License 158 | 159 | Copyright (c) 2008-2020 Andrey Petrov and contributors (see CONTRIBUTORS.txt) 160 | 161 | Permission is hereby granted, free of charge, to any person obtaining a copy 162 | of this software and associated documentation files (the "Software"), to deal 163 | in the Software without restriction, including without limitation the rights 164 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 165 | copies of the Software, and to permit persons to whom the Software is 166 | furnished to do so, subject to the following conditions: 167 | 168 | The above copyright notice and this permission notice shall be included in all 169 | copies or substantial portions of the Software. 170 | 171 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 172 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 173 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 174 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 175 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 176 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 177 | SOFTWARE. 178 | 179 | ---------------------------------------- 180 | 181 | Package: wheel 182 | License: MIT 183 | Requires: n/a 184 | Author: Daniel Holth 185 | Home page: https://github.com/pypa/wheel 186 | 187 | ---------------------------------------- 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data product processor 2 | 3 | The data product processor is a library for dynamically creating and executing Apache Spark Jobs based on a declarative description of a data product. 
4 | 5 | The declaration is based on YAML and covers input and output data stores as well as data structures. It can be augmented with custom, PySpark-based transformation logic. 6 | 7 | ## Installation 8 | **Prerequisites** 9 | - Python 3.x 10 | - Apache Spark 3.x 11 | 12 | **Install with pip** 13 | ```commandline 14 | pip install data-product-processor 15 | ``` 16 | 17 | ## Getting started 18 | ### Declare a basic data product 19 | Please see [Data product specification](docs/data-product-specification.md) for an overview on the files required to declare a data product. 20 | 21 | ### Process the data product 22 | From folder in which the previously created file are stored, run the data-product-processor as follows: 23 | 24 | ```commandline 25 | data-product-processor \ 26 | --default_data_lake_bucket some-datalake-bucket \ 27 | --aws_profile some-profile \ 28 | --aws_region eu-central-1 \ 29 | --local 30 | ``` 31 | This command will run Apache Spark locally (due to the --local switch) and store the output on an S3 bucket (authenticated with the AWS profile used in the parameter). 32 | 33 | If you want to run the library from a different folder than the data product declaration, reference the latter through the additional argument `--product_path`. 34 | ```commandline 35 | data-product-processor \ 36 | --product_path ../path-to-some-data-product \ 37 | --default_data_lake_bucket some-datalake-bucket \ 38 | --aws_profile some-profile \ 39 | --aws_region eu-central-1 \ 40 | --local 41 | ``` 42 | 43 | ## CLI Arguments 44 | ```commandline 45 | data-product-processor --help 46 | 47 | --JOB_ID - the unique id of this Glue/EMR job 48 | --JOB_RUN_ID - the unique id of this Glue job run 49 | --JOB_NAME - the name of this Glue job 50 | --job-bookmark-option - job-bookmark-disable if you don't want bookmarking 51 | --TempDir - tempoarary results directory 52 | --product_path - the data product definition folder 53 | --aws_profile - the AWS profile to be used for connection 54 | --aws_region - the AWS region to be used 55 | --local - local development 56 | --jars - extra jars to be added to the Spark context 57 | --additional-python-modules - this parameter is injected by Glue, currently it is not in use 58 | --default_data_lake_bucket - a default bucket location (with s3a:// prefix) 59 | ``` 60 | ## References 61 | - [Data product specification](docs/data-product-specification.md) 62 | - [Access management](docs/access-management.md) 63 | 64 | ## Tutorials 65 | - [How to write and test custom transformation logic?](docs/how-to/transformation-logic.md) 66 | - [How to reference custom Spark dependencies?](docs/how-to/custom-dependencies.md) 67 | - [How to set up local development?](docs/how-to/local-development.md) 68 | -------------------------------------------------------------------------------- /README_dev.md: -------------------------------------------------------------------------------- 1 | [![pipeline status](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/badges/master/pipeline.svg)](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/-/commits/master) 2 | [![coverage report](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/badges/master/coverage.svg)](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/-/commits/master) 3 | 4 | # Data Mesh Task Interpreter 5 | 6 | Interprets YAML based task definition of 7 | the [data mesh](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-solution) as AWS Glue job. 
8 | 9 | ## Format 10 | 11 | See [model.yml](deprecated_ts/interpreters/model.yml) and [product.yml](deprecated_ts/interpreters/product.yml) 12 | test examples. 13 | 14 | # Setup real-local development environment 15 | 16 | ## Install development environment on OSX 17 | 18 | Everything will be installed in virtual environment in your local project folder. 19 | 20 | ```bash 21 | python3 -m venv .venv 22 | source .venv/bin/activate 23 | pip install -r requirements-test.txt 24 | ``` 25 | 26 | Don't forget to switch the new virtual environment in your IDE too. 27 | 28 | Building the wheel package: 29 | 30 | ```commandline 31 | pip install -U pip wheel setuptools 32 | python3 setup.py bdist_wheel 33 | ``` 34 | As a result you should see 35 | 36 | Also: make sure Java is installed. On OSX: 37 | 38 | ```bash 39 | brew tap homebrew/cask-versions 40 | brew update 41 | brew tap homebrew/cask 42 | brew tap adoptopenjdk/openjdk 43 | brew install --cask adoptopenjdk11 44 | brew install maven 45 | ``` 46 | 47 | Install spark dependencies: 48 | 49 | ```bash 50 | mkdir spark_deps 51 | cd spark_deps 52 | wget https://jdbc.postgresql.org/download/postgresql-42.2.23.jar 53 | ``` 54 | 55 | Install the AWS dependencies for hadoop: 56 | 57 | 1. check the current version of hadoop: ```ll -al .venv/lib/python3.9/site-packages/pyspark/jars |grep hadoop``` 58 | 2. create a POM file in the spark_deps folder (make sure the version field matches the current hadoop version): 59 | 60 | ```xml 61 | 62 | 4.0.0 63 | com.mycompany.app 64 | my-app 65 | 1 66 | 67 | 68 | org.apache.hadoop 69 | hadoop-aws 70 | 3.3.1 71 | 72 | 73 | 74 | ``` 75 | 76 | Download the dependencies: 77 | 78 | ```bash 79 | mvn --batch-mode -f ./pom.xml -DoutputDirectory=./jars dependency:copy-dependencies 80 | mv jars/* . 81 | ``` 82 | 83 | Set the following parameters onto the execution context in your IDE: 84 | 85 | ```commandline 86 | --product_path /tests/assets/integration --default_data_lake_bucket --aws_profile --aws_region --local 87 | ``` 88 | 89 | Alternatively you can run the whole solution from the command line: 90 | ```commandline 91 | data-product-processor --JOB_NAME "TEST" --product_path /tests/assets/integration --default_data_lake_bucket --aws_profile --aws_region 92 | ``` 93 | 94 | Optionally you might need to export Spark Home if the Spark environment is not found in your installation. 95 | 96 | ```commandline 97 | export SPARK_HOME="$(pwd)/.venv/lib/python3.9/site-packages/pyspark" 98 | ``` 99 | 100 | Run the tests from command line (while the virtual environment is activated): 101 | 102 | ```commandline 103 | pytest 104 | ``` 105 | 106 | ## Troubleshooting 107 | 108 | On error: 109 | ``` 110 | py4j.protocol.Py4JError: org.apache.spark.api.python.PythonUtils.getPythonAuthSocketTimeout does not exist in the JVM 111 | ``` 112 | 113 | Type this: 114 | ```commandline 115 | export PYTHONPATH="${SPARK_HOME}/python;${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip;${PYTHONPATH}" 116 | ``` 117 | 118 | # Issues 119 | 120 | ## Sfl4j not found 121 | 122 | ```commandline 123 | [NOT FOUND ] org.slf4j#slf4j-api;1.7.5!slf4j-api.jar 124 | ``` 125 | **Solution** 126 | Remove dir in .ivy2/cache, ivy2/jars and .m2/repository 127 | 128 | ## CI/CD 129 | 130 | The Gitlab based CI/CD pipeline can be dound at: [gitlab-ci.yml](.gitlab-ci.yml). 
131 | 132 | ## Setup local Spark playground 133 | 134 | This is a description of an optional and somewhat unrelated step, for setting up an interactive development environment that helps to experiment with Spark concepts in a local environment. 135 | 136 | Make sure that you execute these commands in a virtual environment (see the top of this document for instructions): 137 | 138 | ```commandline 139 | pip install ptpython 140 | ptpython 141 | ``` 142 | 143 | Type the following in the ptpython console:fs.s3a.aws.credentials.provider 144 | 145 | [optional] only if you encounter errors with the larger snippet bellow: 146 | ```python 147 | import findspark 148 | findspark.init() 149 | ``` 150 | Interactive development: 151 | ```python 152 | import sys 153 | import os 154 | from pyspark import SparkConf 155 | from pyspark.sql import SparkSession 156 | from pyspark.sql.types import ( 157 | StringType, 158 | StructField, 159 | StructType, 160 | IntegerType, 161 | LongType, 162 | DoubleType 163 | ) 164 | 165 | os.environ["AWS_PROFILE"] = '' 166 | conf = SparkConf() 167 | conf.set("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider") 168 | conf.set("spark.jars", './spark_deps/postgresql-42.2.23.jar') 169 | 170 | spark = SparkSession.builder.appName('repl') \ 171 | .config(conf=conf) \ 172 | .getOrCreate() 173 | 174 | movie_schema = StructType([ 175 | StructField('movieId', IntegerType(), True), 176 | StructField('title', StringType(), True), 177 | StructField('genres', StringType(), True) 178 | ]) 179 | 180 | df = spark.createDataFrame([(1, 'Jumanji(1995)', 'Adventure | Children | Fantasy'), 181 | (2, 'Heat (1995)', 'Action|Crime|Thriller')], 182 | movie_schema) 183 | ``` 184 | Get catalog information: 185 | ```python 186 | import boto3, json 187 | session = boto3.Session(profile_name='', region_name='eu-central-1') 188 | glue = session.client('glue') 189 | s = json.dumps(glue.get_table(DatabaseName='test_db', Name='person'), indent=4, default=str) 190 | print(s) 191 | ``` 192 | -------------------------------------------------------------------------------- /access/policy_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": "s3:GetObject", 7 | "Resource": "arn:aws:s3:::///*", 8 | "Condition": { 9 | "StringEquals": { 10 | "s3:ExistingObjectTag/tag_name": "tag_value" 11 | } 12 | } 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /builtin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /builtin/ingest.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import List 6 | import time 7 | import datetime 8 | from driver.task_executor import DataSet 9 | from pyspark.sql.functions import lit, unix_timestamp 10 | from pyspark.sql import SparkSession 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def execute(inp_datasets: List[DataSet], spark_session: SparkSession, create_timestamp=False): 16 | def resolve_data_set_id(ds: DataSet): 17 | model_id_raw = None 18 | if ds.model: 19 | model_id_raw = ds.model.id 20 | else: 21 | model_id_raw = ds.id 22 | 23 | id_tokens = model_id_raw.split('.') 24 | 25 | return id_tokens[len(id_tokens)-1] 26 | 27 | logger.info(f'create timestamp: {create_timestamp}') 28 | if create_timestamp: 29 | timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') 30 | for ds in inp_datasets: 31 | ds.df = ds.df.withColumn('ingest_date', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) 32 | return [DataSet(id=resolve_data_set_id(ds), df=ds.df) for ds in inp_datasets] 33 | -------------------------------------------------------------------------------- /docs/access-management.md: -------------------------------------------------------------------------------- 1 | # Access management 2 | 3 | The access management concept is based on two separate mechanisms: 4 | 5 | 1. Tagging all produced data to control which groups should have access to data 6 | - This is controlled by the data producers, via the model YAML files 7 | - The data producers know their data best and can control which groups should have access (does it contain PII? Is 8 | it intended to be public or private, etc.) 9 | - the platform takes over this process and tags all produced data files based on the configuration in the YAML files 10 | 2. Managing groups of people (or services) who are allows to join those groups to gain access to the data. 11 | - IAM policies, which provide access to S3 data files which have been tagged as mentioned before have to be created 12 | manually (as of now) 13 | - please see `access/policy_template.json` as an example for providing access to files which have specific tags 14 | defined. 15 | - those policies can be attached to IAM groups to provide access to one or multiple combinations of access control 16 | tags 17 | - IAM users then can join and leave groups to gain access to the data, matching the policies assigned to those 18 | groups 19 | 20 | ## Technical implementation 21 | 22 | The S3 writer automatically applies the following tags to all data files written out to S3: 23 | 24 | - tags defined in the `model.yml` under `models..tags` are added to all output data files in the dataset's S3 25 | folder as is, using the tag's name and value without modification. 26 | - tags defined in the `model.yml` under `models..access` are added to all output data files in the dataset's S3 27 | folder as well, but the tag names are prefixed with `access_`, to have a clear distinction between access control tags 28 | and custom tags, every data producer can define without limitation. 29 | - Example: the access tag `confidentiality` with value `private` will be assigned as S3 tag `access_confidentiality` 30 | with value `private`. 
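
To make the prefixing rule concrete, here is a minimal, self-contained sketch (not the actual S3 writer code); the tag names and values are only examples, and `tag_files(bucket, prefix, tags)` from `driver/aws/datalake_api.py` is the helper that applies such a dictionary to every object under an S3 prefix:

```python
# Illustrative sketch of the tagging rule described above -- not the actual S3 writer implementation.

def build_tag_set(custom_tags: dict, access_tags: dict) -> dict:
    """Custom tags are applied as-is; access control tags are prefixed with 'access_'."""
    tags = dict(custom_tags)
    tags.update({f'access_{name}': value for name, value in access_tags.items()})
    return tags


# Example values only: a model declaring the access tag confidentiality=private
# ends up with the S3 tag access_confidentiality=private.
tags = build_tag_set({'source': 'crm'}, {'confidentiality': 'private'})
assert tags == {'source': 'crm', 'access_confidentiality': 'private'}

# A dictionary like this is what driver.aws.datalake_api.tag_files(bucket, prefix, tags)
# applies to every data file under the dataset's S3 prefix.
```
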
31 | 32 | ## Limitations 33 | 34 | Based on the metadata defined in the model's YAML files, the processor will set S3 tags to all files written out to 35 | Amazon S3, found in the data dataset's "folder" (meaning all files, with the 36 | prefix `//`) 37 | 38 | Currently, only files written to S3 are supported to be tagged automatically. 39 | 40 | Access policies and group have to be created by the user manually and IAM users have to be assigned to IAM groups 41 | manually to actually manage access to the data. -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | ## Job processing chain 4 | 5 | ![architectural diagram](./data-product-processor-arch.png) -------------------------------------------------------------------------------- /docs/data-product-processor-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/docs/data-product-processor-arch.png -------------------------------------------------------------------------------- /docs/how-to/custom-dependencies.md: -------------------------------------------------------------------------------- 1 | # How to reference custom Spark dependencies? 2 | 3 | Sometimes you might need custom third party libraries for your aggregation logic. These can be added by creating a 4 | ```requirements.txt``` file in the root of your Data Product folder. In the following example we show, how to use 5 | Pydeequ (a third party analyzer and quality assurance library from Amazon): 6 | 7 | ```requirements.txt 8 | pydeequ 9 | ``` 10 | 11 | Pydeequ - in our example - is the python binding to the Deequ Scala implementation, that needs additional non-python ( 12 | Scala or Java) libraries to be added to the Spark cluster. 13 | This can be added via a ```config.ini``` file (also stored in the root of the data product). 14 | 15 | ```properties 16 | [spark jars] 17 | spark.jars.packages=com.amazon.deequ:deequ:1.2.2-spark-3.0 18 | spark.jars.excludes=net.sourceforge.f2j:arpack_combined_all 19 | ``` 20 | 21 | Once the pre-requisites are there, you can start using the new library in your custom logic: 22 | 23 | ```python 24 | from pyspark.sql.functions import concat, col, lit 25 | from driver.common import find_dataset_by_id 26 | from driver.task_executor import DataSet 27 | from typing import List 28 | from pyspark.sql import SparkSession, Row 29 | from pydeequ.analyzers import * 30 | 31 | 32 | def execute(inp_dfs: List[DataSet], spark_session: SparkSession): 33 | ds = find_dataset_by_id(inp_dfs, 'sample_product.sample_model') 34 | ds.df = ds.df.withColumn('full_name', concat(col('first_name'), lit(' '), col('last_name'))) 35 | 36 | analysis_result = AnalysisRunner(spark_session) 37 | .onData(ds.df) 38 | .addAnalyzer(Size()) 39 | .addAnalyzer(Completeness("b")) 40 | .run() 41 | 42 | 43 | analysis_result_df = AnalyzerContext.successMetricsAsDataFrame(spark_session, analysis_result) 44 | 45 | ds_model = DataSet(id='sample_model', df=ds.df) 46 | ds_analysis = DataSet(id='model_analysis', df=analysis_result_df) 47 | return [ds_model, ds_analysis] 48 | ``` 49 | 50 | Additionally you can create a custom initialisation file, called ```init_hook.py``` in the root folder of your data 51 | product. 
This file will give you control over the Spark environment and the data product processor environment as well. 52 | A feature that we can use to interact with the cluster configuration. 53 | 54 | ```python 55 | from typing import List, Dict 56 | from pyspark import SparkConf 57 | from driver.task_executor import DataSet 58 | 59 | 60 | def enrich_spark_conf(conf: SparkConf) -> SparkConf: 61 | conf.set("spark.sql.warehouse.dir", "some warehouse location") 62 | return conf 63 | 64 | 65 | def add_pre_processors() -> List[callable]: 66 | def my_custom_pre_processor(data_set: DataSet) -> DataSet: 67 | return data_set.df.filter(...) 68 | 69 | return [my_custom_pre_processor] 70 | 71 | 72 | def add_post_processors() -> List[callable]: 73 | def my_custom_post_processor(data_set: DataSet) -> DataSet: 74 | return data_set.df.filter(...) 75 | 76 | return [my_custom_post_processor] 77 | ``` 78 | 79 | **Please note:** all of the above methods are optional. The Spark configuration can also be influenced by the use of the 80 | ini file. 81 | 82 | #### Preparing your unit test to work with Pyspark custom configurations 83 | 84 | Create a file ```pytest.ini``` and add Spark options: 85 | 86 | ```properties 87 | [pytest] 88 | spark_options= 89 | spark.jars.packages=com.amazon.deequ:deequ:1.2.2-spark-3.0 90 | spark.jars.excludes=net.sourceforge.f2j:arpack_combined_all 91 | ``` 92 | -------------------------------------------------------------------------------- /docs/how-to/local-development.md: -------------------------------------------------------------------------------- 1 | # Setup of local development environment 2 | 3 | > **Note**: The subsequent steps assume an installation on ___MacOS/OSX___ 4 | 5 | ## 1) Installation of tools and dependencies 6 | 7 | ### Python 8 | Everything will be installed in virtual environment in your local project folder. 9 | 10 | ```bash 11 | python3 -m venv .venv 12 | source .venv/bin/activate 13 | pip install -r requirements-test.txt 14 | ``` 15 | 16 | ### Java 17 | 18 | Install openjdk and maven. 19 | 20 | ```bash 21 | brew tap homebrew/cask-versions 22 | brew update 23 | brew tap homebrew/cask 24 | brew tap adoptopenjdk/openjdk 25 | brew install --cask adoptopenjdk11 26 | brew install maven 27 | ``` 28 | 29 | ### Apache Spark 30 | 31 | Install spark dependencies: 32 | 33 | ```bash 34 | mkdir spark_deps 35 | cd spark_deps 36 | wget https://jdbc.postgresql.org/download/postgresql-42.2.23.jar 37 | ``` 38 | 39 | Install the AWS dependencies for Apache Hadoop: 40 | 41 | 1. check the current version of hadoop: ```ll -al .venv/lib/python3.9/site-packages/pyspark/jars |grep hadoop``` 42 | 2. create a POM file in the spark_deps folder (make sure the version field matches the current hadoop version): 43 | 44 | ```xml 45 | 46 | 4.0.0 47 | com.mycompany.app 48 | my-app 49 | 1 50 | 51 | 52 | org.apache.hadoop 53 | hadoop-aws 54 | 3.3.1 55 | 56 | 57 | 58 | ``` 59 | 60 | Then, run: 61 | 62 | ```bash 63 | mvn --batch-mode -f ./pom.xml -DoutputDirectory=./jars dependency:copy-dependencies 64 | mv jars/* . 65 | ``` 66 | 67 | ## 2) Test the installation 68 | 69 | > **Note:** Don't forget to switch the new virtual environment in your IDE too. 70 | 71 | Install in the local environment 72 | ```commandline 73 | pip install -e . 74 | ``` 75 | 76 | ### Local invocation of data-product-processor 77 | 78 | To test if the data-product-processor can be executed correctly, follow the subsequent steps. 
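
One option is to set the following arguments on the run/debug configuration (execution context) of your IDE; the bucket name, profile and region below are placeholders — replace them with your own values:

```commandline
--product_path tests/assets/integration --default_data_lake_bucket some-datalake-bucket --aws_profile some-profile --aws_region eu-central-1 --local
```
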
79 | 80 | Alternatively you can run the whole solution from the command line: 81 | 82 | ```commandline 83 | data-product-processor \ 84 | --JOB_NAME "TEST" \ 85 | --product_path /tests/assets/integration \ 86 | --default_data_lake_bucket \ 87 | --aws_profile \ 88 | --aws_region 89 | ``` 90 | 91 | Optionally you might need to export Spark Home if the Spark environment is not found in your installation. 92 | 93 | ```commandline 94 | export SPARK_HOME="$(pwd)/.venv/lib/python3.9/site-packages/pyspark" 95 | ``` 96 | 97 | Run the tests from command line (while the virtual environment is activated): 98 | 99 | ```commandline 100 | pytest 101 | ``` 102 | 103 | ### Package creation 104 | 105 | Test whether the python package (wheel) can be build through which the data-product-processor is distributed. 106 | 107 | ```commandline 108 | pip install -U pip wheel setuptools 109 | python3 setup.py bdist_wheel 110 | ``` 111 | 112 | 113 | # Troubleshooting / common errors 114 | 115 | ## py4j 116 | 117 | ``` 118 | py4j.protocol.Py4JError: org.apache.spark.api.python.PythonUtils.getPythonAuthSocketTimeout does not exist in the JVM 119 | ``` 120 | Resolve through: 121 | ```commandline 122 | export PYTHONPATH="${SPARK_HOME}/python;${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip;${PYTHONPATH}" 123 | ``` 124 | 125 | ## Sfl4j not found 126 | 127 | ```commandline 128 | [NOT FOUND ] org.slf4j#slf4j-api;1.7.5!slf4j-api.jar 129 | ``` 130 | **Solution** 131 | Remove dir in .ivy2/cache, ivy2/jars and .m2/repository 132 | -------------------------------------------------------------------------------- /docs/how-to/transformation-logic.md: -------------------------------------------------------------------------------- 1 | # How to write and test custom aggregation logic? 2 | 3 | Each custom aggregation logic has the same anatomy: it receives a list of input DataSets (that contains the Spark 4 | DataFrame) 5 | and must produce at least one output DataSet with a Spark DataFrame inside. Everything in between is standard Python and 6 | PySpark. 7 | 8 | The example below receives one DataSet with the ID ```person_raw```, adds a new timestamp column if 9 | the ```create_timestamp``` 10 | property was defined in the ```product.yml```'s pipeline > tasks > logic > parameters section and concatenates the 11 | first_name and last_names columns into a full_name column. The very same DataFrame is packaged into two different 12 | DataSets, with two different models referred to in the id property, so that the processor can do some post-processing on 13 | the dataframes, that are defined in those models. 14 | 15 | ```python 16 | def execute(inp_dfs: List[DataSet], spark_session: SparkSession, create_timestamp=False): 17 | ds = find_dataset_by_id(inp_dfs, 'person_raw') 18 | 19 | if create_timestamp: 20 | timestamp = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') 21 | ds.df = ds.df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) 22 | 23 | df = ds.df.withColumn('full_name', concat(col('first_name'), lit(' '), col('last_name'))) 24 | 25 | ds_pub = DataSet(id='person_pub', df=df) 26 | ds_pii = DataSet(id='person_pii', df=df) 27 | 28 | return [ds_pub, ds_pii] 29 | ``` 30 | 31 | In the example above, it is mandatory to provide the ```inp_dfs``` and the ```spark_session``` parameters, because these 32 | are injected by the task executor. 33 | 34 | The DataSet class provides access to the Spark Data Frame, as well to the model and the product metadata structure. 
35 | 36 | ```python 37 | @dataclass 38 | class DataSet: 39 | id: str 40 | df: DataFrame 41 | model: SimpleNamespace = None 42 | product: DataProduct = None 43 | ``` 44 | 45 | These can be referenced in each custom aggregation task code. 46 | 47 | Your custom aggregation logic is parametrised from the ```product.yml``` file's ```tasks``` section: 48 | 49 | ```yaml 50 | logic: 51 | module: tasks.custom_business_logic 52 | parameters: 53 | create_timestamp: false 54 | ``` 55 | 56 | ## Testing 57 | 58 | We recommend using the ```pytest``` framework for writing unit tests for your custom logic. 59 | 60 | ### 1) Create a virtual environment in root folder 61 | 62 | ```commandline 63 | python3 -m venv .venv 64 | source .venv/bin/activate 65 | ``` 66 | 67 | ### 2) Install data-product-processor 68 | ```commandline 69 | pip install data-product-processor 70 | ``` 71 | 72 | ### 3) Install python dependencies for test execution 73 | 74 | Create a ```requirements-test.txt``` file in the root folder of the data product with the following content: 75 | 76 | ```text 77 | pyspark 78 | pyspark-stubs 79 | pytest-spark 80 | pytest-mock 81 | pytest-helpers-namespace 82 | pytest-env 83 | pytest-cov 84 | pytest 85 | ``` 86 | 87 | Install them. 88 | ```commandline 89 | pip install -r requirements-test.txt 90 | ``` 91 | 92 | ### 4) Add tests 93 | 94 | Create a ```tests``` folder in your data product folder. 95 | 96 | ```commandline 97 | mkdir tests 98 | touch tests/__init__.py 99 | ``` 100 | Create a test configuration file called ```test_config.py``` 101 | with [fixtures](https://docs.pytest.org/en/6.2.x/fixture.html) (reusable, support functionality injected into your tests 102 | by the pytest framework). 103 | 104 | ```python 105 | 106 | from types import SimpleNamespace 107 | from pyspark.sql import DataFrame 108 | from pytest import fixture 109 | from pyspark.sql.types import ( 110 | StringType, 111 | StructField, 112 | StructType, 113 | IntegerType 114 | ) 115 | 116 | DEFAULT_BUCKET = 's3://test-bucket' 117 | 118 | 119 | @fixture 120 | def app_args() -> SimpleNamespace: 121 | args = SimpleNamespace() 122 | setattr(args, 'default_data_lake_bucket', DEFAULT_BUCKET) 123 | return args 124 | 125 | 126 | @fixture(scope='module') 127 | def person_schema() -> StructType: 128 | return StructType([ 129 | StructField('id', IntegerType(), False), 130 | StructField('first_name', StringType(), True), 131 | StructField('last_name', StringType(), True), 132 | StructField('age', IntegerType(), True), 133 | StructField('city', StringType(), True), 134 | StructField('gender', StringType(), True), 135 | ]) 136 | 137 | 138 | @fixture(scope='module') 139 | def person_df(spark_session, person_schema) -> DataFrame: 140 | return spark_session.createDataFrame([(1, "John", "Doe", 25, "Berlin", "male"), 141 | (2, "Jane", "Doe", 41, "Berlin", "female"), 142 | (3, "Maxx", "Mustermann", 30, "Berlin", "male") 143 | ], person_schema) 144 | ``` 145 | 146 | Next write your test function for your custom business logic in the ```test_custom_business_logic.py``` file: 147 | 148 | ```python 149 | from pyspark.sql import DataFrame 150 | 151 | 152 | def test_custom_logic(spark_session, person_df: DataFrame): 153 | data_source = DataSet(id='some_schema.some_table', df=person_df) 154 | results: List[DataSet] = tasks.custom_business_logic.execute([data_source], spark_session) 155 | for dataset in results: 156 | assert dataset.id == 'transformed_data_set' 157 | assert dataset.df.count() == person_df.count() 158 | dataset.df.show() 159 | 

You might want to run an end-to-end test by wiring together the minimal structure of the data product processor:

```python
import os
from types import SimpleNamespace

from pyspark.sql import DataFrame

import driver
from driver import DataSet
from driver.processors import schema_checker, constraint_processor, transformer_processor


def test_end_to_end(spark_session, spark_context, person_df: DataFrame, app_args):
    product_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')

    def mock_input_handler(input_definition: SimpleNamespace):
        dfs = {"source_id": person_df}
        return dfs.get(input_definition.table)

    def mock_output_handler(dataset: DataSet):
        assert dataset.id == 'transformed_data_set'
        assert dataset.df.count() == person_df.count()
        dataset.df.show()
        dataset.df.describe().show()

    driver.init(spark_session)
    driver.register_data_source_handler('connection', mock_input_handler)
    driver.register_postprocessors(transformer_processor, schema_checker, constraint_processor)
    driver.register_output_handler('default', mock_output_handler)
    driver.register_output_handler('lake', mock_output_handler)
    driver.process_product(app_args, product_folder)
```

You can run your tests from your favourite editor (e.g. PyCharm) or using the ```pytest``` command line.
-------------------------------------------------------------------------------- /driver/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .driver import process_product, init, install_dependencies
5 | from .task_executor import (
6 |     register_data_source_handler,
7 |     register_preprocessors,
8 |     register_postprocessors,
9 |     register_output_handler,
10 |     register_transformer,
11 |     add_transformers
12 | )
13 | from .core import (
14 |     DataSet,
15 |     ConfigContainer
16 | )
17 | from .common import read_csv, write_csv
18 |
-------------------------------------------------------------------------------- /driver/aws/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
-------------------------------------------------------------------------------- /driver/aws/datalake_api.py: --------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
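# Helper functions for discovering the S3 partition folders of a data set and for tagging its stored files.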
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import os 6 | from typing import List, Dict 7 | 8 | from driver.aws import providers 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Partition: 14 | def __init__(self, path_key): 15 | path_iterator = iter(os.path.split(path_key)) 16 | segments = next(pe for pe in path_iterator if pe).split('=') 17 | self.name = segments[0] 18 | self.value = segments[1] 19 | sub_partition = next(path_iterator, None) 20 | self.subpartitions = list() 21 | if sub_partition: 22 | o = Partition(sub_partition) 23 | self.subpartitions.append(o) 24 | 25 | def get_partition_chain(self, prefix: str, parent_key: str = None, parent_value: str = None) -> List[Dict[str, str]]: 26 | pchain = list() 27 | prepped_prefix = os.path.join(prefix, f'{self.name}={self.value}') 28 | pkeys = list() 29 | pkey_values = list() 30 | if parent_key and parent_value: 31 | pkeys.append(parent_key) 32 | pkey_values.append(parent_value) 33 | if len(self.subpartitions) > 0: 34 | for sp in self.subpartitions: 35 | pchain.extend(sp.get_partition_chain(prepped_prefix, parent_key=self.name, parent_value=self.value)) 36 | else: 37 | pkeys.append(self.name) 38 | pkey_values.append(self.value) 39 | pchain.append({'keys': pkeys, 'values': pkey_values, 'location': prepped_prefix}) 40 | return pchain 41 | 42 | 43 | def read_partitions(bucket: str, container_folder: str = None): 44 | s3 = providers.get_s3() 45 | rsp = s3.list_objects_v2(Bucket=bucket, Prefix=os.path.join(container_folder, '')) 46 | keys = set(os.path.dirname(k.get('Key')) for k in rsp.get('Contents')) 47 | prefix = rsp.get('Prefix') 48 | partition_keys = [p[len(prefix):] for p in keys if p != prefix.rstrip('/')] 49 | partitions = list() 50 | for p in partition_keys: 51 | partitions.append(Partition(p)) 52 | return partitions 53 | 54 | 55 | def tag_files(bucket: str, prefix: str, tags: dict): 56 | s3 = providers.get_s3() 57 | 58 | tags_s3 = [] 59 | for tag_name in tags.keys(): 60 | tags_s3.append({'Key': tag_name, 'Value': str(tags[tag_name])}) 61 | 62 | for key in find_files(bucket, prefix): 63 | s3.put_object_tagging(Bucket=bucket, Key=key, Tagging={'TagSet': tags_s3}) 64 | 65 | 66 | def find_files(bucket: str, prefix: str) -> List[str]: 67 | s3 = providers.get_s3() 68 | files = s3.list_objects_v2(Bucket=bucket, Prefix=prefix) 69 | return [f['Key'] for f in files['Contents']] 70 | -------------------------------------------------------------------------------- /driver/aws/glue_api.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
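# Glue Data Catalog maintenance: creates or updates the database, table and partitions that describe a produced data set.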
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | import botocore 7 | from mypy_boto3_glue.type_defs import GetDatabasesResponseTypeDef, DatabaseTypeDef, GetTablesResponseTypeDef, \ 8 | TableTypeDef, TableInputTypeDef, StorageDescriptorTypeDef, ColumnTypeDef, DatabaseInputTypeDef 9 | from mypy_boto3_glue.client import Exceptions 10 | from driver.aws import providers 11 | from driver.aws.resolvers import resolve_table_input, resolve_partition_inputs, resolve_database 12 | from driver.task_executor import DataSet 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def drain_data_catalog(data_catalog_id: str): 18 | glue = providers.get_glue() 19 | try: 20 | get_tables_response: GetTablesResponseTypeDef = glue.get_tables(DatabaseName=data_catalog_id) 21 | for table in get_tables_response.get('TableList'): 22 | glue.delete_table(DatabaseName=data_catalog_id, Name=table.get('Name')) 23 | except Exception as enf: 24 | if enf.__class__.__name__ == 'EntityNotFoundException': 25 | logger.warning( 26 | f'Database {data_catalog_id} does not exists in the data catalog. No tables will be deleted.') 27 | 28 | 29 | def update_data_catalog(ds: DataSet): 30 | glue = providers.get_glue() 31 | logger.info(f'--> Updating the data catalog for data product [{ds.product_id}] and model [{ds.model.id}].') 32 | 33 | def upsert_database(): 34 | try: 35 | rsp: GetDatabasesResponseTypeDef = glue.get_database(Name=ds.product_id) 36 | # todo: update database with changes 37 | except Exception as enf: 38 | if enf.__class__.__name__ == 'EntityNotFoundException': 39 | # database does not exists yet 40 | logger.warning( 41 | f'Database {ds.product_id} does not exists in the data catalog ({str(enf)}). It is going to be created.') 42 | # todo: add permissions 43 | glue.create_database( 44 | DatabaseInput=resolve_database(ds)) 45 | else: 46 | raise enf 47 | 48 | def upsert_table(): 49 | try: 50 | rsp: GetTablesResponseTypeDef = glue.get_table(DatabaseName=ds.product_id, Name=ds.id) 51 | # todo: update table 52 | glue.delete_table(DatabaseName=ds.product_id, Name=ds.id) 53 | glue.create_table(DatabaseName=ds.product_id, TableInput=resolve_table_input(ds)) 54 | # glue.update_table(DatabaseName=ds.product_id, TableInput=resolve_table_input(ds)) 55 | except Exception as enf: # EntityNotFoundException 56 | # table not found 57 | if enf.__class__.__name__ == 'EntityNotFoundException': 58 | logger.warning( 59 | f'Table [{ds.id}] cannot be found in the catalog schmea [{ds.product_id}]. 
Table is going to be created.') 60 | glue.create_table(DatabaseName=ds.product_id, TableInput=resolve_table_input(ds)) 61 | else: 62 | raise enf 63 | # rsp: GetTablesResponseTypeDef = glue.get_table(DatabaseName=ds.product_id, Name=ds.id) 64 | # todo: update partitions 65 | # todo: register with lakeformation 66 | 67 | def upsert_partitions(): 68 | # entries = resolve_partition_entries(ds) 69 | # rsp = glue.batch_update_partition(DatabaseName=ds.product_id, TableName=ds.model_id, Entries=entries) 70 | partition_inputs = resolve_partition_inputs(ds) 71 | if not partition_inputs: 72 | return 73 | rsp = glue.batch_create_partition(DatabaseName=ds.product_id, TableName=ds.id, 74 | PartitionInputList=partition_inputs) 75 | # rsp = glue.batch_update_partition(DatabaseName=ds.product_id, TableName=ds.id, 76 | # Entries=partition_inputs) 77 | if rsp.get('Errors'): 78 | raise Exception(f"Couldn't update the table [{ds.id}] with the partitions.") 79 | status_code = rsp.get('ResponseMetadata').get('HTTPStatusCode') 80 | logger.info(f'Partition upsert response with HTTP Status Code: {str(status_code)}') 81 | # todo: write a proper error handling here 82 | 83 | upsert_database() 84 | upsert_table() 85 | upsert_partitions() # todo: this is not yet an upsert (just in name but not in implementation) 86 | -------------------------------------------------------------------------------- /driver/aws/providers.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import traceback 6 | import boto3 7 | import mypy_boto3_glue 8 | from driver.core import ( 9 | Connection, 10 | ConnectionNotFoundException, 11 | DataProductTable, 12 | TableNotFoundException, 13 | ) 14 | 15 | __SESSION__ = None 16 | logger = logging.getLogger(__name__) 17 | 18 | def init( 19 | key_id: str = None, 20 | key_material: str = None, 21 | profile: str = None, 22 | region: str = None, 23 | ): 24 | global __SESSION__ 25 | if key_id and key_material and region: 26 | __SESSION__ = boto3.Session( 27 | aws_access_key_id=key_id, 28 | aws_secret_access_key=key_material, 29 | region_name=region, 30 | ) 31 | elif key_id and key_material and not region: 32 | __SESSION__ = boto3.Session( 33 | aws_access_key_id=key_id, aws_secret_access_key=key_material 34 | ) 35 | elif profile and region: 36 | __SESSION__ = boto3.Session(profile_name=profile, region_name=region) 37 | elif profile and not region: 38 | __SESSION__ = boto3.Session(profile_name=profile) 39 | elif region: 40 | __SESSION__ = boto3.Session(region_name=region) 41 | else: 42 | __SESSION__ = boto3.Session() 43 | logger.debug(f'boto session region: {__SESSION__.region_name}') 44 | # amongst others used to verify bucket ownership in interaction with s3 45 | global __AWS_ACCOUNT_ID__ 46 | sts = __SESSION__.client("sts") 47 | __AWS_ACCOUNT_ID__ = sts.get_caller_identity()["Account"] 48 | 49 | def get_session() -> boto3.Session: 50 | return __SESSION__ 51 | 52 | 53 | def get_aws_account_id() -> str: 54 | if not __AWS_ACCOUNT_ID__: 55 | raise Exception("Boto session is not initialized. Please call init first.") 56 | return __AWS_ACCOUNT_ID__ 57 | 58 | 59 | def get_glue() -> mypy_boto3_glue.GlueClient: 60 | if not get_session(): 61 | raise Exception("Boto session is not initialized. 
Please call init first.") 62 | return get_session().client("glue") 63 | 64 | 65 | def get_s3(): 66 | if not get_session(): 67 | raise Exception("Boto session is not initialized. Please call init first.") 68 | 69 | return get_session().client("s3") 70 | 71 | def describe_session(): 72 | boto_session = get_session() 73 | return f'| Profile: {boto_session.profile_name} | Region: {boto_session.region_name} | Access Key: {boto_session.get_credentials().access_key}' 74 | 75 | def connection_provider(connection_id: str) -> Connection: 76 | """ 77 | Returns a data connection object, that can be used to connect to databases. 78 | :param connection_id: 79 | :return: 80 | """ 81 | try: 82 | if not get_session(): 83 | raise Exception("Boto session is not initialized. Please call init first.") 84 | glue = get_session().client("glue") 85 | response = glue.get_connection(Name=connection_id, HidePassword=False) 86 | if "Connection" not in response: 87 | logger.error(f'Connection {connection_id} not found. Boto session: {describe_session()}. Connection request response: {response}') 88 | raise ConnectionNotFoundException( 89 | f"Connection [{connection_id}] could not be found." 90 | ) 91 | cprops = response.get("Connection").get("ConnectionProperties") 92 | logger.debug(f'Connection details: {response.get("Connection")}') 93 | native_host = cprops.get("JDBC_CONNECTION_URL")[len("jdbc:") :] 94 | logger.debug(f'native host definition: {native_host}') 95 | connection = Connection.parse_obj( 96 | { 97 | "name": connection_id, 98 | "host": native_host, 99 | "principal": cprops.get("USERNAME"), 100 | "credential": cprops.get("PASSWORD"), 101 | "type": native_host.split(":")[0], 102 | "ssl": cprops.get("JDBC_ENFORCE_SSL"), 103 | } 104 | ) 105 | return connection 106 | except Exception as e: 107 | logger.error(f'{type(e).__name__} exception received while retrieving the connection to the data source: {str(e)}). Boto session {describe_session()}.') 108 | logger.debug(f'Exception log: {traceback.format_exc()}') 109 | raise ConnectionNotFoundException( 110 | f"Connection [{connection_id}] could not be found. {str(e)}. Make sure you have the right region defined." 111 | ) 112 | 113 | 114 | def datalake_provider(product_id, table_id) -> DataProductTable: 115 | if not get_session(): 116 | raise Exception("Boto session is not initialized. Please call init first.") 117 | glue = get_session().client("glue") 118 | response = glue.get_table(DatabaseName=product_id, Name=table_id) 119 | if "Table" not in response: 120 | raise TableNotFoundException( 121 | f"Data Product Table [{product_id}.{table_id}] could not be found." 122 | ) 123 | table = DataProductTable.parse_obj( 124 | { 125 | "product_id": product_id, 126 | "table_id": table_id, 127 | "storage_location": response.get("Table") 128 | .get("StorageDescriptor") 129 | .get("Location"), 130 | } 131 | ) 132 | return table 133 | -------------------------------------------------------------------------------- /driver/aws/resolvers.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
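# Mapping helpers that translate DataSet metadata into Glue API input structures (tables, columns, partitions, storage descriptors).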
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os 5 | from typing import List, Dict 6 | from mypy_boto3_glue.type_defs import TableTypeDef, StorageDescriptorTypeDef, ColumnTypeDef, SerDeInfoTypeDef, \ 7 | BatchUpdatePartitionRequestEntryTypeDef, PartitionInputTypeDef, TableInputTypeDef, DatabaseInputTypeDef 8 | from pyspark.sql import DataFrame 9 | 10 | from driver.aws.datalake_api import Partition 11 | from driver.aws import datalake_api 12 | from driver.task_executor import DataSet 13 | from driver.util import filter_list_by_id, safe_get_property 14 | 15 | 16 | def resolve_partitions(ds: DataSet) -> List[ColumnTypeDef]: 17 | return [ColumnTypeDef(Name=p, Type=dict(ds.df.dtypes)[p]) for p in ds.partitions] 18 | 19 | 20 | def resolve_table_type(ds: DataSet) -> str: 21 | return 'EXTERNAL_TABLE' 22 | 23 | 24 | def resolve_table_parameters(ds: DataSet) -> Dict[str, str]: 25 | return { 26 | "classification": "parquet", 27 | "compressionType": "none", 28 | "objectCount": "1", 29 | "recordCount": str(ds.df.count()), 30 | "typeOfData": "file" 31 | } 32 | 33 | 34 | def resolve_input_format(ds: DataSet) -> str: 35 | formats = { 36 | 'parquet': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 37 | } 38 | return formats.get(ds.storage_format) 39 | 40 | 41 | def resolve_output_format(ds: DataSet) -> str: 42 | formats = { 43 | 'parquet': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 44 | } 45 | return formats.get(ds.storage_format) 46 | 47 | 48 | def resolve_compressed(ds: DataSet) -> bool: 49 | # return str(False).lower() 50 | return False 51 | 52 | 53 | def resolve_serde_info(ds: DataSet) -> SerDeInfoTypeDef: 54 | parquet = SerDeInfoTypeDef(SerializationLibrary='org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe', 55 | Parameters={'serialization.format': '1'}) 56 | serdes = { 57 | 'parquet': parquet 58 | } 59 | return serdes.get(ds.storage_format) 60 | 61 | 62 | def resolve_storage_descriptor(ds: DataSet, override_location: str = None) -> StorageDescriptorTypeDef: 63 | if override_location: 64 | path = f's3://{os.path.join(override_location, "")}' 65 | else: 66 | path = f"s3://{ds.dataset_storage_path.lstrip('/')}" 67 | return StorageDescriptorTypeDef( 68 | Location=path, 69 | InputFormat=resolve_input_format(ds), 70 | OutputFormat=resolve_output_format(ds), 71 | Compressed=resolve_compressed(ds), 72 | NumberOfBuckets=-1, # todo: check how to calculate this. 
73 | SerdeInfo=resolve_serde_info(ds), 74 | Parameters=resolve_table_parameters(ds), # todo: partition size 75 | Columns=resolve_columns(ds) 76 | ) 77 | 78 | 79 | def resolve_columns(ds: DataSet) -> List[ColumnTypeDef]: 80 | def lookup(column_name): 81 | if not hasattr(ds.model, 'columns'): 82 | return str() 83 | model_column = filter_list_by_id(ds.model.columns, column_name) 84 | if hasattr(model_column, 'name'): 85 | return f"{safe_get_property(model_column, 'name')}: {safe_get_property(model_column, 'description')}" 86 | else: 87 | return str() 88 | 89 | return [ColumnTypeDef(Name=cn, Type=ct, Comment=lookup(cn)) for cn, ct in ds.df.dtypes if cn not in ds.partitions] 90 | 91 | 92 | def resolve_table(ds: DataSet) -> TableTypeDef: 93 | return TableTypeDef( 94 | Name=ds.model_name, 95 | DatabaseName=ds.product_id, 96 | Description=ds.model_description, 97 | Owner=ds.product_owner, 98 | PartitionKeys=resolve_partitions(ds), 99 | TableType=resolve_table_type(ds), 100 | Parameters=resolve_table_parameters(ds), 101 | StorageDescriptor=resolve_storage_descriptor(ds) 102 | ) 103 | 104 | 105 | def resolve_table_input(ds: DataSet) -> TableInputTypeDef: 106 | return TableInputTypeDef( 107 | Name=ds.id, 108 | Description=f'{ds.model_name}: {ds.model_description}', 109 | Owner=ds.product_owner or str(), 110 | PartitionKeys=resolve_partitions(ds), 111 | TableType='EXTERNAL_TABLE', 112 | Parameters=resolve_table_parameters(ds), 113 | StorageDescriptor=resolve_storage_descriptor(ds) 114 | ) 115 | 116 | 117 | def resolve_partition_input(partition_location: str, partition_values: list, ds: DataSet) -> PartitionInputTypeDef: 118 | return PartitionInputTypeDef( 119 | Values=partition_values, 120 | StorageDescriptor=resolve_storage_descriptor(ds, override_location=partition_location), 121 | Parameters=resolve_table_parameters(ds), 122 | ) 123 | 124 | 125 | def reshuffle_partitions(prefix: str, partitions: List[Partition]) -> dict: 126 | partition_list = list() 127 | partition_dict = dict() 128 | for po in partitions: 129 | partition_list.extend(po.get_partition_chain(prefix=prefix)) 130 | for pdict in partition_list: 131 | # if pdict.get('location') not in ['glue-job-test-destination-bucket/person/gender=Female', 132 | # 'glue-job-test-destination-bucket/person/gender=Male']: 133 | # #todo: remove this ugly hack 134 | partition_dict[pdict.get('location')] = { 135 | 'keys': pdict.get('keys'), 136 | 'values': pdict.get('values') 137 | } 138 | return partition_dict 139 | 140 | 141 | def resolve_partition_inputs(ds: DataSet, format_for_update: bool = False) -> List[PartitionInputTypeDef]: 142 | bucket = ds.storage_location.lstrip('/').split('/')[0] 143 | folder = '/'.join(ds.dataset_storage_path.lstrip('/').split('/')[1:]) 144 | ps: List[Partition] = datalake_api.read_partitions(bucket=bucket, container_folder=folder) 145 | pdict = reshuffle_partitions(os.path.join(bucket, folder), ps) 146 | partition_defs = list() 147 | for k, v in pdict.items(): 148 | partition_values = v.get('values') 149 | if format_for_update: 150 | entry = {'PartitionValueList': v.get('values'), 151 | 'PartitionInput': resolve_partition_input(partition_location=k, partition_values=partition_values, 152 | ds=ds)} 153 | partition_defs.append(entry) 154 | else: 155 | partition_defs.append( 156 | resolve_partition_input(partition_location=k, partition_values=partition_values, ds=ds)) 157 | return partition_defs 158 | 159 | 160 | def resolve_partition_entries(ds: DataSet) -> List[BatchUpdatePartitionRequestEntryTypeDef]: 161 | partition_defs = 
list() 162 | bucket = ds.storage_location.lstrip('/').split('/')[0] 163 | folder = '/'.join(ds.dataset_storage_path.lstrip('/').split('/')[1:]) 164 | ps: List[Partition] = datalake_api.read_partitions(bucket=bucket, container_folder=folder) 165 | pdict = reshuffle_partitions(bucket, ps) 166 | for k, v in pdict.items(): 167 | partition_defs.append(BatchUpdatePartitionRequestEntryTypeDef( 168 | PartitionValueList=v.get('values'), 169 | PartitionInput=resolve_partition_input(partition_location=k, partition_values=v.get('values'), ds=ds) 170 | )) 171 | return partition_defs 172 | 173 | 174 | def resolve_database(ds: DataSet) -> DatabaseInputTypeDef: 175 | return DatabaseInputTypeDef(Name=ds.product_id, Description=ds.product.description or str()) 176 | -------------------------------------------------------------------------------- /driver/common.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List 5 | 6 | from pyspark.sql import DataFrame 7 | from pyspark.sql.types import StructType 8 | from driver import driver 9 | from driver.task_executor import DataSet 10 | 11 | 12 | def find_dataset_by_id(dss: List[DataSet], dataset_id): 13 | return next(iter([ds for ds in dss if ds.id == dataset_id]), None) 14 | 15 | 16 | def remap_schema(ds: DataFrame) -> List[StructType]: 17 | schema_fields = list() 18 | for col in ds.model.columns: 19 | if hasattr(col, 'transform') and 'skip' in [t.type for t in col.transform]: 20 | continue 21 | nullable = True 22 | if hasattr(col, 'constraints'): 23 | nullable = 'not_null' not in [c.type for c in col.constraints] 24 | schema_fields.append({'metadata': {}, 'name': col.id, 'type': col.type, 'nullable': nullable}) 25 | return StructType.fromJson({'fields': schema_fields, 'type': 'struct'}) 26 | 27 | 28 | def read_csv(path: str) -> DataFrame: 29 | return ( 30 | driver.get_spark().read 31 | .format("csv") 32 | .option("mode", "DROPMALFORMED") 33 | .option("header", "true") 34 | .load(path)) 35 | 36 | 37 | def write_csv(df: DataFrame, output_path: str, buckets=3) -> None: 38 | df.coalesce(buckets).write.format("csv").mode("overwrite").options(header="true").save( 39 | path=output_path) 40 | -------------------------------------------------------------------------------- /driver/core.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
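# Core domain objects shared across the processor: ConfigContainer, DataProduct, DataSet, the Connection model and the common exception types.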
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from urllib.parse import urlparse 5 | from botocore.client import logger 6 | from jsonschema import validate, ValidationError 7 | import os 8 | from types import SimpleNamespace 9 | from dataclasses import dataclass 10 | from pyspark.sql import DataFrame 11 | from enum import Enum 12 | from pydantic import ( 13 | BaseModel, 14 | AnyUrl, 15 | SecretStr, 16 | conint, 17 | validator, root_validator, parse_obj_as, ValidationError, error_wrappers, Field) 18 | from typing import Dict, List, Tuple, Any, TypeVar, Union 19 | from pydantic import AnyUrl 20 | from driver import util 21 | 22 | Scalar = TypeVar('Scalar', int, float, bool, str) 23 | 24 | class ConfigContainer(SimpleNamespace): 25 | def __init__(self, **kwargs): 26 | super().__init__(**kwargs) 27 | # for key, value in dictionary.items(): 28 | # if isinstance(value, dict): 29 | # self.__setattr__(key, ConfigContainer(value)) 30 | # else: 31 | # self.__setattr__(key, value) 32 | 33 | def __getattribute__(self, value): 34 | try: 35 | return super().__getattribute__(value) 36 | except AttributeError: 37 | # super().__setattr__(value, SimpleNamespace()) 38 | return super().__getattribute__(value) 39 | 40 | @dataclass 41 | class DataProduct: 42 | id: str 43 | description: str = None 44 | owner: str = None 45 | 46 | 47 | @dataclass 48 | class DataSet: 49 | id: str 50 | df: DataFrame 51 | model: ConfigContainer = None 52 | product: DataProduct = None 53 | 54 | @classmethod 55 | def find_by_id(cls, dataset_list, ds_id): 56 | return next(iter([m for m in dataset_list if m.id == ds_id]), None) 57 | 58 | @property 59 | def partitions(self) -> List[str]: 60 | if self.storage_options and hasattr(self.storage_options, 'partition_by'): 61 | if isinstance(self.storage_options.partition_by, str): 62 | return [self.storage_options.partition_by] 63 | else: 64 | return [p for p in self.storage_options.partition_by] 65 | else: 66 | return list() 67 | 68 | @property 69 | def storage_location(self) -> (str | None): 70 | if util.check_property(self, 'model.storage.location'): 71 | return self.model.storage.location 72 | else: 73 | return None 74 | 75 | @storage_location.setter 76 | def storage_location(self, path: str): 77 | if not self.model: 78 | raise Exception("There's no model on the dataset, so location cannot be set yet.") 79 | elif not hasattr(self.model, 'storage'): 80 | storage = ConfigContainer() 81 | setattr(storage, 'location', path) 82 | setattr(self.model, 'storage', storage) 83 | elif not hasattr(self.model.storage, 'location'): 84 | setattr(self.model.storage, 'location', path) 85 | else: 86 | self.model.storage.location = path 87 | 88 | @property 89 | def path(self) -> str: 90 | if self.id is None: 91 | raise Exception(f'Can not construct data set path because product id is not defined.') 92 | if not self.storage_location: 93 | raise Exception(f'The data set storage location is not set for dataset id: {self.id}.') 94 | return f"{self.product.id}/{self.id}" 95 | 96 | @property 97 | def dataset_storage_path(self) -> str: 98 | return f'{self.storage_location}/{self.path}' 99 | 100 | @property 101 | def storage_type(self) -> str: 102 | if self.model and hasattr(self.model, 'storage'): 103 | return self.model.storage.type 104 | else: 105 | return 'default' 106 | 107 | @property 108 | def storage_format(self) -> (str | None): 109 | if self.model and hasattr(self.model, 'storage'): 110 | return self.model.storage.format if hasattr(self.model.storage, 'format') else None 111 | else: 112 | return None 
113 | 114 | @property 115 | def storage_options(self) -> (ConfigContainer | None): 116 | if self.model and hasattr(self.model, 'storage') and hasattr(self.model.storage, 'options'): 117 | return self.model.storage.options 118 | else: 119 | return None 120 | 121 | @property 122 | def product_id(self) -> (str | None): 123 | return self.product.id if self.product else None 124 | 125 | @product_id.setter 126 | def product_id(self, p_id: str) -> None: 127 | if self.product: 128 | self.product.id = p_id 129 | else: 130 | self.product = DataProduct(id=p_id) 131 | 132 | @property 133 | def product_description(self) -> str: 134 | return self.product.description if self.product else None 135 | 136 | @property 137 | def product_owner(self) -> str: 138 | return self.product.owner if self.product else None 139 | 140 | @property 141 | def tags(self) -> dict: 142 | if not hasattr(self, 'model') or not hasattr(self.model, 'tags'): 143 | return dict() 144 | if self.id is None: 145 | raise Exception(f'Can not construct tags, id is not defined.') 146 | return self.model.tags.__dict__ 147 | 148 | @property 149 | def access_tags(self) -> dict: 150 | if not hasattr(self, 'model') or not hasattr(self.model, 'access'): 151 | return dict() 152 | if self.id is None: 153 | raise Exception(f'Can not construct tags, id is not defined.') 154 | return self.model.access.__dict__ 155 | 156 | @property 157 | def all_tags(self) -> dict: 158 | if self.id is None: 159 | raise Exception(f'Can not construct tags, id is not defined.') 160 | return {**self.tags, **{'access_' + k: v for k, v in self.access_tags.items()}} 161 | 162 | @property 163 | def model_name(self) -> str: 164 | return self.model.name if hasattr(self, 'model') and hasattr(self.model, 'name') else self.id 165 | 166 | @property 167 | def model_description(self) -> str: 168 | return self.model.description if hasattr(self, 'model') and hasattr(self.model, 'description') else str() 169 | 170 | 171 | class SchemaValidationException(Exception): 172 | def __init__(self, message: str, data_set: DataSet): 173 | self.data_set = data_set 174 | super().__init__(message) 175 | 176 | 177 | class ValidationException(Exception): 178 | def __init__(self, message: str): 179 | super().__init__(message) 180 | 181 | 182 | class ConnectionNotFoundException(Exception): 183 | pass 184 | 185 | 186 | class TableNotFoundException(Exception): 187 | pass 188 | 189 | 190 | class JobExecutionException(Exception): 191 | pass 192 | 193 | 194 | class ProcessorChainExecutionException(Exception): 195 | pass 196 | 197 | 198 | class ResolverException(Exception): 199 | pass 200 | 201 | 202 | class LocationDsn(AnyUrl): 203 | allowed_schemes = {'datastore', 'connection'} 204 | user_required = False 205 | 206 | 207 | class PostgresDsn(AnyUrl): 208 | allowed_schemes = {'postgres', 'postgresql'} 209 | user_required = False 210 | 211 | 212 | class JdbcDsn(AnyUrl): 213 | allowed_schemes = {'jdbc', 'jdbc'} 214 | user_required = False 215 | 216 | 217 | class MysqlDsn(AnyUrl): 218 | allowed_schemes = {'mysql', 'mysql'} 219 | user_required = False 220 | 221 | 222 | class IOType(str, Enum): 223 | model = 'model' 224 | connection = 'connection' 225 | file = 'file' 226 | 227 | 228 | class ArtefactType(str, Enum): 229 | model = 'model' 230 | product = 'product' 231 | 232 | 233 | class ConnectionType(str, Enum): 234 | jdbc = 'jdbc' 235 | postgresql = 'postgresql' 236 | redshift = 'redshift' 237 | mysql = 'mysql' 238 | mariadb = 'mariadb' 239 | mongodb = 'mongodb' 240 | s3 = 's3' 241 | csv = 'csv' 242 | parquet = 
'parquet' 243 | 244 | @classmethod 245 | def is_file(cls, conn_type: 'ConnectionType'): 246 | return conn_type in [ConnectionType.csv, ConnectionType.parquet, ConnectionType.s3] 247 | 248 | 249 | url_parsers = { 250 | ConnectionType.postgresql: PostgresDsn, 251 | ConnectionType.jdbc: JdbcDsn 252 | } 253 | 254 | 255 | class Connection(BaseModel): 256 | name: str 257 | principal: Union[str, None] 258 | credential: Union[SecretStr, None] 259 | host: str 260 | port: Union[conint(lt=65535), None] 261 | db_name: Union[str, None] 262 | ssl: bool = False 263 | type: ConnectionType 264 | timeout: int = 3600 265 | batch_size: int = 10000 266 | meta_data: Dict[str, Scalar] = {} 267 | 268 | class Config: 269 | validate_assignment = True 270 | 271 | @classmethod 272 | def is_port_required(cls, conn_type: Union[ConnectionType, str]): 273 | if isinstance(conn_type, str): 274 | conn_type = ConnectionType(conn_type) 275 | return not ConnectionType.is_file(conn_type) 276 | 277 | @classmethod 278 | def is_jdbc_supported(cls, conn_type: Union[ConnectionType, str]): 279 | return Connection.is_port_required(conn_type) 280 | 281 | @classmethod 282 | def is_db_name_required(cls, conn_type: Union[ConnectionType, str]): 283 | return Connection.is_port_required(conn_type) 284 | 285 | @classmethod 286 | def is_userinfo_required(cls, conn_type: Union[ConnectionType, str]): 287 | return Connection.is_port_required(conn_type) 288 | 289 | @classmethod 290 | def fill_url_contained_values(cls, values: dict, ctype: Union[ConnectionType, str]): 291 | def strip_path(string: str): 292 | return string.strip('/') 293 | 294 | validable_keys = ['principal', 'credential', 'port', 'db_name'] 295 | autofill_checkers = { 296 | 'port': Connection.is_port_required, 297 | 'principal': Connection.is_userinfo_required, 298 | 'credential': Connection.is_userinfo_required, 299 | 'db_name': Connection.is_db_name_required, 300 | } 301 | url_property_map = { 302 | 'port': ('port', None), 303 | 'host': ('host', None), 304 | 'principal': ('user', None), 305 | 'credential': ('password', None), 306 | 'db_name': ('path', strip_path) 307 | } 308 | none_valued_keys = [k for k in values.keys() if not values.get(k)] 309 | values_keys = set(list(values.keys()) + none_valued_keys) 310 | vk = set(validable_keys) 311 | missing_keys = vk.difference(values_keys) 312 | parsable_keys = [] 313 | for k in missing_keys: 314 | if autofill_checkers.get(k)(ctype): 315 | parsable_keys.append(k) 316 | else: 317 | values[k] = None 318 | if len(parsable_keys) == 0: 319 | return 320 | url_parser = url_parsers.get(ctype, AnyUrl) 321 | try: 322 | url: AnyUrl = parse_obj_as(url_parser, values.get('host')) 323 | for pk in parsable_keys: 324 | func_name, converter = url_property_map.get(pk) 325 | value = getattr(url, func_name) 326 | if not value: 327 | raise ValueError(f'The field {pk} is required and not provided in the url or directly.') 328 | if converter: 329 | values[pk] = converter(value) 330 | else: 331 | values[pk] = value 332 | except ValueError as verr: 333 | raise verr 334 | except TypeError as tep: 335 | raise ValueError( 336 | f'Programming error at Connection Validation: {str(tep)}. 
' 337 | f'Function name for property to be invoked on URL of type {type(url)}: {func_name}') 338 | except Exception as ex: 339 | raise ValueError( 340 | f'When one of the following fields is missing {validable_keys}, ' 341 | f'the $host URL must include its value; {str(ex)}') 342 | 343 | def get_native_connection_url(self, generate_creds=True) -> str: 344 | url_parser = url_parsers.get(self.type, AnyUrl) 345 | try: 346 | url: AnyUrl = parse_obj_as(url_parser, self.host) 347 | if Connection.is_userinfo_required(self.type): 348 | user = url.user or self.principal 349 | password = url.password or self.credential.get_secret_value() 350 | if Connection.is_db_name_required(self.type): 351 | path = url.path or f'/{self.db_name}' 352 | if Connection.is_port_required(self.type): 353 | port = url.port or self.port 354 | if generate_creds: 355 | return AnyUrl.build(scheme=url.scheme, user=user, password=password, host=url.host, port=port, 356 | path=path) 357 | else: 358 | return AnyUrl.build(scheme=url.scheme, host=url.host, port=port, path=path) 359 | except (error_wrappers.ValidationError, ValidationError): 360 | # not a url format 361 | passwd = self.credential.get_secret_value() if self.credential else '' 362 | userinfo = f'{self.principal}:{passwd}@' if Connection.is_userinfo_required(self.type) else '' 363 | host = self.host.strip('/') if self.host else '' 364 | port = f':{self.port}' if Connection.is_port_required(self.type) else '' 365 | db_path = f'/{self.db_name}' if Connection.is_db_name_required(self.type) else '' 366 | return f'{str(self.type.value)}://{userinfo}{host}{port}{db_path}' 367 | 368 | def get_jdbc_connection_url(self, generate_creds=True) -> str: 369 | if Connection.is_jdbc_supported(self.type): 370 | return f'jdbc:{self.get_native_connection_url(generate_creds)}' 371 | else: 372 | raise AssertionError(f"The connection {self.type.value} doesn't support JDBC.") 373 | 374 | @root_validator(pre=True) 375 | def check_host_url_dependent_fields(cls, values: dict): 376 | connection_type = values.get('type') 377 | host = values.get('host') 378 | if not host or not connection_type: 379 | raise ValueError('The host and the connection type must be defined.') 380 | Connection.fill_url_contained_values(values, connection_type) 381 | return values 382 | 383 | 384 | class DataProductTable(BaseModel): 385 | product_id: str 386 | table_id: str 387 | storage_location: str 388 | 389 | @property 390 | def storage_location_s3a(self): 391 | return self.storage_location.replace('s3://', 's3a://') 392 | 393 | 394 | def resolve_data_set_id(io_def: ConfigContainer) -> str: 395 | def xtract_domain(s): 396 | if '.' 
in s: 397 | domain_elements = s.rsplit('.') 398 | return domain_elements[len(domain_elements) - 1] 399 | else: 400 | return s 401 | 402 | if io_def.type == IOType.model: 403 | model_url = getattr(io_def, io_def.type) 404 | return xtract_domain(model_url) 405 | elif io_def.type == IOType.connection: 406 | return xtract_domain(io_def.model) if hasattr(io_def, 'model') else xtract_domain(io_def.table) 407 | elif io_def.type == IOType.file: 408 | if hasattr(io_def, IOType.model.name): 409 | return xtract_domain(getattr(io_def, IOType.model.name)) 410 | else: 411 | parsed_file = urlparse(io_def.file) 412 | filename = os.path.basename(parsed_file.path) 413 | return filename.rsplit('.')[0] 414 | else: 415 | raise ConnectionNotFoundException(f'The IO Type {io_def.type} is not supported.') 416 | 417 | 418 | def resolve_data_product_id(io_def: ConfigContainer) -> str: 419 | if io_def.type == IOType.model: 420 | return getattr(io_def, io_def.type).rsplit('.')[0] 421 | elif io_def.type == IOType.connection: 422 | return getattr(io_def, 'table').rsplit('.')[0] 423 | -------------------------------------------------------------------------------- /driver/driver.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import sys, os 6 | import traceback 7 | 8 | from pyspark.sql import SparkSession 9 | from driver import task_executor, packager 10 | from .packager import ziplib 11 | from .util import compile_models, compile_product 12 | 13 | __SPARK__: SparkSession = None 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def get_spark() -> SparkSession: 18 | if __SPARK__: 19 | return __SPARK__ 20 | else: 21 | raise RuntimeError('Spark Session is not created yet. 
Call init() first.') 22 | 23 | 24 | def get_or_create_session(config=None) -> SparkSession: # pragma: no cover 25 | """Build spark session for jobs running on cluster.""" 26 | spark = SparkSession.builder.appName(__name__) \ 27 | .config(conf=config) \ 28 | .enableHiveSupport() \ 29 | .getOrCreate() 30 | 31 | return spark 32 | 33 | 34 | def init(spark_session: SparkSession = None, spark_config=None): 35 | global __SPARK__ 36 | if not spark_session: 37 | logger.info('creating a new Spark session.') 38 | __SPARK__ = get_or_create_session(spark_config) 39 | else: 40 | logger.info('returning already existing Spark session.') 41 | __SPARK__ = spark_session 42 | # sc = __SPARK__.sparkContext 43 | # sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true") 44 | 45 | 46 | def install_dependencies(product_path: str): 47 | new_packages = packager.install_dependencies(product_path) 48 | if new_packages: 49 | logger.info(f'packaging up the following new dependencies {new_packages.keys()}') 50 | for new_pack_name in new_packages.keys(): 51 | zipfile = ziplib(new_packages.get(new_pack_name), new_pack_name) 52 | logger.info(f'-----> installing {zipfile}') 53 | get_spark().sparkContext.addPyFile(zipfile) 54 | logger.debug('=> Dependencies are installed.') 55 | 56 | 57 | def process_product(args, product_path: str): 58 | try: 59 | product = compile_product(product_path, args) 60 | models = compile_models(product_path, product) 61 | for task in product.pipeline.tasks: 62 | task_executor.execute(product, task, models, product_path) 63 | except Exception as e: 64 | traceback.print_exc() 65 | logger.error(f"Couldn't execute job due to >> {type(e).__name__}: {str(e)}") 66 | sys.exit(-1) 67 | -------------------------------------------------------------------------------- /driver/io_handlers.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
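# Input and output handlers: JDBC, file and data lake readers, plus the data lake writer that tags the stored files and updates the Glue catalog.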
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import os 6 | from .core import ConfigContainer 7 | from urllib.parse import urlparse 8 | 9 | from pyspark.sql import DataFrame, DataFrameWriter 10 | from driver.aws import glue_api, datalake_api 11 | from driver.core import Connection, resolve_data_set_id, resolve_data_product_id 12 | from driver.driver import get_spark 13 | from driver.task_executor import DataSet 14 | 15 | __CONN_PROVIDER__ = None 16 | __DATA_PRODUCT_PROVIDER__ = None 17 | 18 | from driver.util import check_property 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def init(connection_provider: callable, data_product_provider: callable): 24 | global __CONN_PROVIDER__, __DATA_PRODUCT_PROVIDER__ 25 | __CONN_PROVIDER__ = connection_provider 26 | __DATA_PRODUCT_PROVIDER__ = data_product_provider 27 | 28 | 29 | jdbc_drivers = { 30 | 'postgresql': 'org.postgresql.Driver', 31 | 'mysql': 'com.mysql.jdbc' 32 | } 33 | 34 | 35 | def connection_input_handler(props: ConfigContainer) -> DataFrame: 36 | connection: Connection = __CONN_PROVIDER__(props.connection) 37 | logger.info(f'using input conection: {connection.get_jdbc_connection_url(generate_creds=False)}') 38 | jdbcDF = ( 39 | get_spark() 40 | .read.format("jdbc") 41 | .option("url", connection.get_jdbc_connection_url(generate_creds=False)) 42 | .option("dbtable", props.table) 43 | .option("user", connection.principal) 44 | .option("password", connection.credential.get_secret_value()) 45 | .option("driver", jdbc_drivers.get(connection.type.name)) 46 | .option("ssl", connection.ssl) 47 | .option("sslmode", "require") 48 | .load() 49 | ) 50 | return jdbcDF 51 | 52 | 53 | def file_input_handler(props: ConfigContainer) -> DataFrame: 54 | def get_type(): 55 | return props.options.type or 'parquet' 56 | 57 | def get_separator(): 58 | return props.options.separator or ',' 59 | 60 | def get_infer_schema(): 61 | return props.options.infer_schema or 'false' 62 | 63 | def get_header(): 64 | return props.options.header or 'true' 65 | 66 | parsed = urlparse(props.file) 67 | scheme = 's3a' if parsed.scheme == 's3' else parsed.scheme 68 | if parsed.scheme: 69 | location = f'{scheme}://{parsed.netloc}{parsed.path}' 70 | else: 71 | location = f'{parsed.path}' 72 | logger.info(f'-> [File Input Handler]: reading from {location}') 73 | if hasattr(props, 'options'): 74 | df = get_spark().read.load(location, format=get_type(), sep=get_separator(), 75 | inferSchema=get_infer_schema(), header=get_header()) 76 | else: 77 | df = get_spark().read.load(location) 78 | return df 79 | 80 | 81 | def lake_input_handler(io_def: ConfigContainer) -> DataFrame: 82 | prod_id = resolve_data_product_id(io_def) 83 | model_id = resolve_data_set_id(io_def) 84 | data_product_table = __DATA_PRODUCT_PROVIDER__(prod_id, model_id) 85 | df = get_spark().read.parquet(data_product_table.storage_location_s3a) 86 | return df 87 | 88 | 89 | def file_output_handler(ds: DataSet, options: ConfigContainer): 90 | pass 91 | 92 | 93 | def resolve_compression(ds: DataSet): 94 | # todo: parse this into an enum 95 | # none, uncompressed, snappy, gzip, lzo, brotli, lz4, 96 | if check_property(ds, 'model.storage.options.compression'): 97 | return ds.model.storage.options.compression 98 | else: 99 | return 'snappy' 100 | 101 | 102 | def resolve_coalesce(ds: DataSet): 103 | if check_property(ds, 'model.storage.options.coalesce'): 104 | return ds.model.storage.options.coalesce 105 | else: 106 | return 2 107 | 108 | 109 | def resolve_header(ds: DataSet): 110 | if 
check_property(ds, 'model.storage.options.skip_first_row'): 111 | return ds.model.storage.options.skip_first_row 112 | else: 113 | return 'true' 114 | 115 | 116 | def lake_output_handler(ds: DataSet): 117 | output = f"{'s3a://'}{ds.dataset_storage_path.lstrip('/')}" 118 | logging.info(f'-> [Lake Output Handler]: writing data product to: {output}') 119 | ds.df.coalesce(resolve_coalesce(ds)).write \ 120 | .partitionBy(*ds.partitions or []) \ 121 | .format(ds.storage_format) \ 122 | .mode('overwrite') \ 123 | .option('header', resolve_header(ds)) \ 124 | .option('compression', resolve_compression(ds)) \ 125 | .save(output) 126 | 127 | datalake_api.tag_files(ds.storage_location, ds.path, ds.all_tags) 128 | 129 | # print(f'# partitions after write {ds.df.rdd.getNumPartitions()}') 130 | # todo: recheck coalesce value 131 | # todo: add parquet compression support / the glue catalog needs it too 132 | # todo: add bucket support & also to the glue catalog 133 | glue_api.update_data_catalog(ds) 134 | -------------------------------------------------------------------------------- /driver/packager.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os, sys, zipfile, itertools 5 | from typing import List, Dict 6 | from pip._vendor import pkg_resources 7 | 8 | 9 | def install_pip_package(packages: list): 10 | from pip._internal.commands import create_command 11 | install = create_command('install', isolated=False) 12 | install.main(packages) 13 | 14 | 15 | def ziplib(dist_path, package_name) -> str: 16 | libpath = os.path.dirname(os.path.join(dist_path, package_name)) 17 | zippath = f'{package_name}.zip' 18 | zf = zipfile.PyZipFile(zippath, mode='w') 19 | try: 20 | zf.debug = 3 21 | zf.writepy(libpath) 22 | return zippath 23 | finally: 24 | zf.close() 25 | 26 | 27 | def install_dependencies(product_path: str) -> Dict: 28 | """ Collects requirements from a requirements from the data product file, 29 | installs the dependencies and returns a dictionary with all installed packages and their path. 
30 | """ 31 | def collect_packages() -> set: 32 | ws = pkg_resources.WorkingSet(pkg_resources.working_set.entries) 33 | eks = ws.entry_keys 34 | return set(itertools.chain(*[eks.get(k) for k in eks.keys()])) 35 | 36 | def find_path_for_package(package_name): 37 | ws = pkg_resources.WorkingSet(pkg_resources.working_set.entries) 38 | eks = ws.entry_keys 39 | return next(iter([path for path in eks.keys() if package_name in eks.get(path)]), None) 40 | 41 | #todo: review and remove the one below 42 | 43 | # def collect_deps(package_name: str): 44 | # def merge_reqs(package: pkg_resources.DistInfoDistribution): 45 | # return_set = set({package.project_name}) 46 | # required_deps: List[pkg_resources.Requirement] = p.requires() 47 | # required_pnames = [r.project_name for r in required_deps] 48 | # for rpack in required_pnames: 49 | # return_set.update(merge_reqs(rpack)) 50 | # return return_set 51 | # ws = pkg_resources.WorkingSet(pkg_resources.working_set.entries) 52 | # package: pkg_resources.DistInfoDistribution = ws.by_key[package_name] 53 | # return merge_reqs(package) 54 | 55 | requirements = os.path.join(product_path, 'requirements.txt') 56 | if os.path.isfile(requirements): 57 | before = collect_packages() 58 | with open(requirements) as f: 59 | lines = [line.rstrip('\n') for line in f] 60 | install_pip_package(lines) 61 | after = collect_packages() 62 | delta_packages = after - before 63 | return_packs = dict() 64 | for delta_pack in delta_packages: 65 | return_packs[delta_pack] = find_path_for_package(delta_pack) 66 | return return_packs 67 | -------------------------------------------------------------------------------- /driver/processors.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
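# Built-in processors: constraint validators (not_null, unique, regexp, past, future, freshness), column transformers (anonymize, encrypt, skip, bucketize, rename_column), schema checking and type casting.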
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import hashlib 5 | import logging 6 | import re 7 | from datetime import datetime, timedelta 8 | from typing import List 9 | 10 | from driver import common 11 | from pyspark.sql import DataFrame, Window 12 | from pyspark.sql.functions import col, lit, udf, hash, to_date, row_number 13 | from pyspark.sql.types import StringType, StructField, TimestampType 14 | from pyspark.ml.feature import Bucketizer 15 | from driver.core import ValidationException, SchemaValidationException 16 | from driver.task_executor import DataSet 17 | 18 | from driver.util import check_property 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def null_validator(df: DataFrame, col_name: str, cfg: any = None): 24 | # null_value_ratio = df.select(count(when(col(col_name).isNull(), True)) / count(lit(1)).alias('count')) \ 25 | # .first()[0] 26 | # ('not_null', self.column, null_value_ratio <= self.threshold, self.threshold, null_value_ratio 27 | col = df.select(col_name) 28 | if col.filter((col[col_name].isNull()) | (col[col_name] == "")).count() > 0: 29 | raise ValidationException(f'Column: {col_name} is expected to be not null.') 30 | 31 | 32 | def regexp_validator(df: DataFrame, col_name: str, cfg: any = None): 33 | if not hasattr(cfg, 'value'): 34 | raise ValidationException(f'Column {col_name} has regexp constraint validator, but no value option provided.') 35 | col = df.select(col_name) 36 | if col.count() != col.filter(col[col_name].rlike(cfg.value)).count(): 37 | raise ValidationException(f"Column: [{col_name}] doesn't match regexp: {cfg.value}") 38 | 39 | 40 | def unique_validator(df: DataFrame, col_name: str, cfg: any = None): 41 | col = df.select(col_name) 42 | if col.distinct().count() != col.count(): 43 | raise ValidationException(f'Column: {col_name} is expected to be unique.') 44 | 45 | 46 | def resolve_time_delta(cfg): 47 | if hasattr(cfg, 'time_unit'): 48 | if cfg.time_unit == 'minutes': 49 | return timedelta(minutes=cfg.threshold) 50 | elif cfg.time_unit == 'hours': 51 | return timedelta(hours=cfg.threshold) 52 | elif cfg.time_unit == 'days': 53 | return timedelta(days=cfg.threshold) 54 | elif cfg.time_unit == 'weeks': 55 | return timedelta(weeks=cfg.threshold) 56 | elif cfg.time_unit == 'seconds': 57 | return timedelta(seconds=cfg.threshold) 58 | else: 59 | return timedelta(minutes=cfg.threshold) 60 | 61 | 62 | def past_validator(df: DataFrame, col_name: str, cfg: any = None): 63 | now = datetime.now() 64 | if cfg and hasattr(cfg, 'threshold'): 65 | now = now + resolve_time_delta(cfg) 66 | count = df.filter(df["trx_date"].cast(TimestampType()) >= lit(now)).count() 67 | if count > 0: 68 | raise ValidationException(f'Column {col_name} has values in the future (beyond {now}).') 69 | 70 | 71 | def future_validator(df: DataFrame, col_name: str, cfg: any = None): 72 | now = datetime.now() 73 | if cfg and hasattr(cfg, 'threshold'): 74 | now = now - resolve_time_delta(cfg) 75 | count = df.filter(df["trx_date"].cast(TimestampType()) <= lit(now)).count() 76 | if count > 0: 77 | raise ValidationException(f'Column {col_name} has values in the past (before {now}).') 78 | 79 | 80 | def freshness_validator(df: DataFrame, col_name: str, cfg: any = None): 81 | regex = re.compile('seconds|minutes|hours|days|weeks', re.I) 82 | if not hasattr(cfg, 'threshold') or not hasattr(cfg, 'time_unit') or not regex.match(str(cfg.time_unit)): 83 | raise ValidationException( 84 | f'[threshold] and [time_unit] options must be specified. 
Time units shoudl have one of the following values: seconds|minutes|hours|days|weeks.') 85 | if hasattr(cfg, 'group_by'): 86 | # df.withColumn("rn", row_number().over(Window.partitionBy(cfg.group_by).orderBy(col(col_name).desc()))) 87 | # df = df.filter(col("rn") == 1).drop("rn") 88 | res_df = df.select(col(col_name), col(cfg.group_by)).withColumn('rn', row_number().over( 89 | Window.partitionBy(cfg.group_by).orderBy(col(col_name).desc()))).filter(col('rn') == 1).drop('rn') 90 | threshold = datetime.now() - resolve_time_delta(cfg) 91 | for row in res_df.collect(): 92 | if row[col_name] < threshold: 93 | raise ValidationException( 94 | f'The most recent row for group [{cfg.group_by}] is older ({row[col_name]}) than the threshold ({threshold}).') 95 | else: 96 | threshold = datetime.now() - resolve_time_delta(cfg) 97 | most_recent = df.select(col(col_name)).orderBy(col(col_name).desc()).first()[col_name] 98 | if most_recent < threshold: 99 | raise ValidationException(f'The most recent row is older ({most_recent}) than the threshold ({threshold}).') 100 | 101 | 102 | def min_validator(df: DataFrame, col_name: str, cfg: any = None): 103 | # todo: implement min validator 104 | pass 105 | 106 | 107 | def max_validator(df: DataFrame, col_name: str, cfg: any = None): 108 | # todo: implement max validator 109 | pass 110 | 111 | 112 | constraint_validators = { 113 | "not_null": null_validator, 114 | "unique": unique_validator, 115 | "regexp": regexp_validator, 116 | "past": past_validator, 117 | "future": future_validator, 118 | "freshness": freshness_validator 119 | } 120 | 121 | 122 | def hasher(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 123 | # todo: implement salting 124 | return df.withColumn(col_name, hash(col(col_name))) 125 | 126 | 127 | def encrypt(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 128 | # todo: implement key handling + kms 129 | def encrypt_f(value: object, key: str = None): 130 | if key: 131 | return hashlib.sha256(str(value).encode() + key.encode()).hexdigest() 132 | else: 133 | return hashlib.sha256(str(value).encode()).hexdigest() 134 | 135 | encrypt_udf = udf(encrypt_f, StringType()) 136 | return df.withColumn(col_name, encrypt_udf(col_name, lit(cfg.key if hasattr(cfg, 'key') else None))) 137 | 138 | 139 | def skip_column(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 140 | return df.drop(col(col_name)) 141 | 142 | 143 | def rename_col(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 144 | # todo: update the schema for the dataset or remove this one 145 | return df.withColumnRenamed(col_name, cfg.name) 146 | 147 | 148 | def bucketize(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 149 | buckets = cfg.buckets.__dict__ 150 | bucket_labels = dict(zip(range(len(buckets.values())), buckets.values())) 151 | bucket_splits = [float(split) for split in buckets.keys()] 152 | bucket_splits.append(float('Inf')) 153 | 154 | bucketizer = Bucketizer(splits=bucket_splits, inputCol=col_name, outputCol="tmp_buckets") 155 | bucketed = bucketizer.setHandleInvalid("keep").transform(df) 156 | 157 | udf_labels = udf(lambda x: bucket_labels[x], StringType()) 158 | bucketed = bucketed.withColumn(col_name, udf_labels("tmp_buckets")) 159 | bucketed = bucketed.drop(col('tmp_buckets')) 160 | 161 | return bucketed 162 | 163 | 164 | built_in_transformers = { 165 | 'anonymize': hasher, 166 | 'encrypt': encrypt, 167 | 'skip': skip_column, 168 | 'bucketize': bucketize, 169 | 'rename_column': rename_col 170 | } 171 | 172 | 173 | def 
find_schema_delta(ds: DataSet) -> List[StructField]: 174 | def lookup(name, schema_list): 175 | return next(filter(lambda rsf: rsf.name == name, schema_list)) 176 | 177 | if check_property(ds, 'model.columns'): 178 | required_schema = common.remap_schema(ds) 179 | data_frame_fields = [{'name': x.name, 'type': x.dataType} for x in ds.df.schema] 180 | required_schema_fields = [{'name': x.name, 'type': x.dataType} for x in required_schema] 181 | delta_fields = [x for x in required_schema_fields if x not in data_frame_fields] 182 | return [lookup(x.get('name'), required_schema) for x in delta_fields] 183 | else: 184 | return None 185 | 186 | 187 | def type_caster(ds: DataSet): 188 | try: 189 | mismatched_fields = find_schema_delta(ds) 190 | for mismatched_field in mismatched_fields or []: 191 | logger.info( 192 | f'--> typecasting [{mismatched_field.name}] to type: [{mismatched_field.dataType.typeName()}] in [{ds.id}]') 193 | field_in_df = next(iter([f for f in ds.df.schema.fields if f.name == mismatched_field.name]), None) 194 | if field_in_df: 195 | ds.df = ds.df.withColumn(mismatched_field.name, 196 | col(mismatched_field.name).cast(mismatched_field.dataType.typeName())) 197 | return ds 198 | except Exception as e: 199 | raise e 200 | 201 | 202 | def schema_checker(ds: DataSet): 203 | if check_property(ds, 'model.columns'): 204 | logger.info( 205 | f'-> checking schema for dataset [{ds.id}] with model id: [{ds.model.id}]. Data frame columns: {len(ds.df.columns)}') 206 | missing_fields = find_schema_delta(ds) 207 | if missing_fields: 208 | raise SchemaValidationException( 209 | f'The following fields are missing from the data set [{ds.id}]: {missing_fields}. ' 210 | f'Current schema: {ds.df.schema}', 211 | ds) 212 | if hasattr(ds, 'model') and hasattr(ds.model, 'validation') and ds.model.validation == 'strict': 213 | if not hasattr(ds, 'df'): 214 | raise SchemaValidationException(f'The dataset [{ds.id}] is missing a dataframe with strict validation', 215 | ds) 216 | if len(ds.df.columns) != len(ds.model.columns): 217 | xtra = set(ds.df.columns) - set([x.id for x in ds.model.columns]) 218 | raise SchemaValidationException( 219 | f'The dataset [{ds.id}] has a dataframe with more columns ({xtra}) than stated in the model', ds) 220 | return ds 221 | 222 | 223 | def razor(ds: DataSet): 224 | if hasattr(ds.model, 'xtra_columns') and ds.model.xtra_columns == 'raze': 225 | xtra_columns = list(set(ds.df.columns) - set([x.id for x in ds.model.columns])) 226 | ds.df = ds.df.drop(*xtra_columns) 227 | return ds 228 | 229 | 230 | def constraint_processor(ds: DataSet): 231 | if not check_property(ds, 'model.columns'): 232 | return ds 233 | 234 | for col in ds.model.columns: 235 | if not hasattr(col, 'constraints'): 236 | continue 237 | constraint_types = [c.type for c in col.constraints] 238 | for ctype in constraint_types: 239 | cvalidator = constraint_validators.get(ctype) 240 | if cvalidator: 241 | constraint = next(iter([co for co in col.constraints if co.type == ctype]), None) 242 | constraint_opts = constraint.options if hasattr(constraint, 'options') else None 243 | cvalidator(ds.df, col.id, constraint_opts) 244 | return ds 245 | 246 | 247 | def transformer_processor(data_set: DataSet): 248 | """ 249 | Will run a prebuilt a transformation on each and every column of the model. 
250 | :param data_set: the data set that contains the data frame; 251 | :return: the data set with the processed data frame 252 | """ 253 | if not check_property(data_set, 'model.columns'): 254 | return data_set 255 | for col in data_set.model.columns: 256 | if not hasattr(col, 'transform'): 257 | continue 258 | transformers = [t.type for t in col.transform] 259 | for trsfrm_type in transformers: 260 | tcall = built_in_transformers.get(trsfrm_type) 261 | if tcall: 262 | trsfrm = next(iter([to for to in col.transform if to.type == trsfrm_type]), None) 263 | trsfm_opts = trsfrm.options if trsfrm and hasattr(trsfrm, 'options') else None 264 | data_set.df = tcall(data_set.df, col.id, trsfm_opts) 265 | return data_set 266 | -------------------------------------------------------------------------------- /driver/schema/1.rc-1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "http://json-schema.org/draft-07/schema#", 4 | "title": "Data product schema", 5 | "type": "object", 6 | "required": [ 7 | "models", 8 | "schema_version" 9 | ], 10 | "additionalProperties": false, 11 | "properties": { 12 | "schema_version": { 13 | "type": "string", 14 | "description": "The version of this schema file" 15 | }, 16 | "models": { 17 | "type": "array", 18 | "minItems": 1, 19 | "items": { 20 | "$ref": "#/$defs/model" 21 | } 22 | } 23 | }, 24 | "$defs": { 25 | "model": { 26 | "type": "object", 27 | "additionalProperties": false, 28 | "required": [ 29 | "id", 30 | "version", 31 | "columns" 32 | ], 33 | "properties": { 34 | "id": { 35 | "type": "string", 36 | "minLength": 1 37 | }, 38 | "name": { 39 | "type": "string", 40 | "minLength": 1 41 | }, 42 | "version": { 43 | "type": "string", 44 | "minLength": 1 45 | }, 46 | "xtra_columns": { 47 | "type": "string", 48 | "enum": [ 49 | "raze", 50 | "ignore" 51 | ] 52 | }, 53 | "validation": { 54 | "type": "string", 55 | "enum": [ 56 | "strict", 57 | "lazy" 58 | ] 59 | }, 60 | "extends": { 61 | "type": "string", 62 | "minLength": 1 63 | }, 64 | "description": { 65 | "type": "string", 66 | "minLength": 1 67 | }, 68 | "meta": { 69 | "type": "object", 70 | "additionalProperties": true 71 | }, 72 | "storage": { 73 | "$ref": "#/$defs/storage" 74 | }, 75 | "tags": { 76 | "type": "object", 77 | "additionalProperties": true 78 | }, 79 | "access": { 80 | "type": "object", 81 | "additionalProperties": true, 82 | "properties": { 83 | "domain": { 84 | "type": "string", 85 | "minLength": 1 86 | }, 87 | "confidentiality": { 88 | "type": "string", 89 | "minLength": 1 90 | } 91 | } 92 | }, 93 | "columns": { 94 | "type": "array", 95 | "minItems": 1, 96 | "items": { 97 | "$ref": "#/$defs/column" 98 | } 99 | } 100 | } 101 | }, 102 | "storage": { 103 | "type": "object", 104 | "required": [ 105 | "type" 106 | ], 107 | "additionalProperties": false, 108 | "properties": { 109 | "type": { 110 | "type": "string" 111 | }, 112 | "format": { 113 | "type": "string" 114 | }, 115 | "options": { 116 | "type": "object", 117 | "additionalProperties": true 118 | }, 119 | "location": { 120 | "type": "string", 121 | "pattern": "([^ !$`&*()+]|(\\\\[ !$`&*()+]))+" 122 | } 123 | } 124 | }, 125 | "column": { 126 | "type": "object", 127 | "additionalProperties": false, 128 | "required": [ 129 | "id" 130 | ], 131 | "properties": { 132 | "id": { 133 | "type": "string", 134 | "minLength": 1 135 | }, 136 | "type": { 137 | "type": "string", 138 | "minLength": 1 139 | }, 140 | "source": { 141 | "type": [ 
142 | "array", 143 | "string" 144 | ], 145 | "items": { 146 | "type": "string" 147 | }, 148 | "minLength": 1 149 | }, 150 | "name": { 151 | "type": "string", 152 | "minLength": 1 153 | }, 154 | "description": { 155 | "type": "string", 156 | "minLength": 1 157 | }, 158 | "transform": { 159 | "type": "array", 160 | "minItems": 1, 161 | "items": { 162 | "type": "object", 163 | "properties": { 164 | "type": { 165 | "type": "string", 166 | "minLength": 1 167 | }, 168 | "options": { 169 | "type": "object" 170 | } 171 | } 172 | } 173 | }, 174 | "constraints": { 175 | "type": "array", 176 | "minItems": 1, 177 | "items": { 178 | "type": "object", 179 | "properties": { 180 | "type": { 181 | "type": "string", 182 | "minLength": 1 183 | }, 184 | "options": { 185 | "type": "object" 186 | } 187 | } 188 | } 189 | } 190 | } 191 | } 192 | } 193 | } -------------------------------------------------------------------------------- /driver/schema/1.rc-1/product.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "http://json-schema.org/draft-07/schema#", 4 | "title": "Data Product Schema", 5 | "description": "Used to validate the product.yml that defines the data product execution pipeline", 6 | "type": "object", 7 | "required": [ 8 | "schema_version", 9 | "product" 10 | ], 11 | "additionalProperties": false, 12 | "properties": { 13 | "schema_version": { 14 | "type": "string", 15 | "description": "The version of this schema file" 16 | }, 17 | "product": { 18 | "type": "object", 19 | "required": [ 20 | "id", 21 | "version", 22 | "owner", 23 | "pipeline", 24 | "description" 25 | ], 26 | "additionalProperties": false, 27 | "properties": { 28 | "id": { 29 | "type": "string", 30 | "minLength": 1 31 | }, 32 | "version": { 33 | "type": "string", 34 | "minLength": 1 35 | }, 36 | "owner": { 37 | "type": "string", 38 | "pattern": "^([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\\.[A-Z|a-z]{2,})+$" 39 | }, 40 | "name": { 41 | "type": "string", 42 | "minLength": 1 43 | }, 44 | "description": { 45 | "type": "string", 46 | "minLength": 1 47 | }, 48 | "defaults": { 49 | "$ref": "#/$defs/defaults" 50 | }, 51 | "engine": { 52 | "type": "string", 53 | "enum": [ 54 | "glue", 55 | "emr", 56 | "dbt" 57 | ] 58 | }, 59 | "pipeline": { 60 | "$ref": "#/$defs/pipeline" 61 | } 62 | } 63 | } 64 | }, 65 | "$defs": { 66 | "defaults": { 67 | "type": "object" 68 | }, 69 | "pipeline": { 70 | "type": "object", 71 | "required": [ 72 | "schedule", 73 | "tasks" 74 | ], 75 | "additionalProperties": false, 76 | "properties": { 77 | "schedule": { 78 | "type": "string", 79 | "minLength": 1 80 | }, 81 | "tasks": { 82 | "type": "array", 83 | "items": { 84 | "$ref": "#/$defs/task" 85 | } 86 | } 87 | } 88 | }, 89 | "task": { 90 | "type": "object", 91 | "required": [ 92 | "id", 93 | "inputs", 94 | "outputs" 95 | ], 96 | "additionalProperties": false, 97 | "properties": { 98 | "id": { 99 | "type": "string", 100 | "minLength": 1 101 | }, 102 | "logic": { 103 | "$ref": "#/$defs/task_logic" 104 | }, 105 | "inputs": { 106 | "type": "array", 107 | "items": { 108 | "anyOf": [ 109 | { 110 | "$ref": "#/$defs/io_type_connection" 111 | }, 112 | { 113 | "$ref": "#/$defs/io_type_model" 114 | }, 115 | { 116 | "$ref": "#/$defs/io_type_file" 117 | } 118 | ] 119 | } 120 | }, 121 | "outputs": { 122 | "type": "array", 123 | "items": { 124 | "anyOf": [ 125 | { 126 | "$ref": "#/$defs/io_type_connection" 127 | }, 128 | { 129 | "$ref": "#/$defs/io_type_model" 130 | }, 
131 | { 132 | "$ref": "#/$defs/io_type_file" 133 | } 134 | ] 135 | } 136 | } 137 | } 138 | }, 139 | "task_logic": { 140 | "type": "object", 141 | "additionalProperties": false, 142 | "required": [ 143 | "module" 144 | ], 145 | "properties": { 146 | "module": { 147 | "type": "string", 148 | "minLength": 1 149 | }, 150 | "parameters": { 151 | "type": "object", 152 | "additionalProperties": true 153 | } 154 | } 155 | }, 156 | "io_type_connection": { 157 | "type": "object", 158 | "required": [ 159 | "connection", 160 | "table" 161 | ], 162 | "additionalProperties": true, 163 | "properties": { 164 | "connection": { 165 | "type": "string", 166 | "minLength": 1 167 | }, 168 | "table": { 169 | "type": "string", 170 | "minLength": 1 171 | }, 172 | "model": { 173 | "type": "string", 174 | "minLength": 1 175 | } 176 | } 177 | }, 178 | "io_type_model": { 179 | "type": "object", 180 | "required": [ 181 | "model" 182 | ], 183 | "additionalProperties": false, 184 | "properties": { 185 | "model": { 186 | "type": "string", 187 | "mindLength": 1 188 | } 189 | } 190 | }, 191 | "io_type_file": { 192 | "type": "object", 193 | "required": [ 194 | "file" 195 | ], 196 | "additionalProperties": true, 197 | "properties": { 198 | "file": { 199 | "type": "string", 200 | "minLength": 1 201 | }, 202 | "model": { 203 | "type": "string", 204 | "minLength": 1 205 | }, 206 | "options": { 207 | "type": "object", 208 | "additionalProperties": false, 209 | "properties": { 210 | "type": { 211 | "type": "string", 212 | "minLength": 3 213 | }, 214 | "infer_schema": { 215 | "type": "boolean" 216 | }, 217 | "separator": { 218 | "type": "string", 219 | "minLength": 1 220 | }, 221 | "header": { 222 | "type": "boolean" 223 | } 224 | } 225 | } 226 | } 227 | } 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /driver/task_executor.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | import logging 6 | 7 | import sys 8 | 9 | from typing import List, Callable, Dict 10 | from .util import filter_list_by_id, enrich_models 11 | from .core import DataSet, DataProduct, IOType, ProcessorChainExecutionException, ValidationException, \ 12 | resolve_data_set_id, ResolverException, resolve_data_product_id, ConfigContainer 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | data_src_handlers: dict = dict() 17 | pre_processors: list = list() 18 | post_processors: list = list() 19 | transformers: dict = dict() 20 | output_handlers: dict = dict() 21 | 22 | 23 | def register_data_source_handler(src_type_id: str, handler: callable): 24 | data_src_handlers.update({src_type_id: handler}) 25 | 26 | 27 | def register_preprocessors(*handlers: callable): 28 | pre_processors.extend(handlers) 29 | 30 | 31 | def register_postprocessors(*handlers: callable): 32 | post_processors.extend(handlers) 33 | 34 | 35 | def register_transformer(transformer_id: str, handler: callable): 36 | transformers.update({transformer_id: handler}) 37 | 38 | 39 | def add_transformers(additional_transformers: Dict[str, callable]): 40 | transformers.update(additional_transformers) 41 | 42 | 43 | def register_output_handler(output_handler_type: str, handler: callable): 44 | output_handlers.update({output_handler_type: handler}) 45 | 46 | 47 | def resolve_io_type(io_definition: ConfigContainer) -> IOType: 48 | if hasattr(io_definition, IOType.connection.name): 49 | return IOType.connection 50 | elif hasattr(io_definition, IOType.file.name): 51 | return IOType.file 52 | elif hasattr(io_definition, IOType.model.name): 53 | return IOType.model 54 | else: 55 | raise ResolverException(f'This IO type is not supported yet: {io_definition.__repr__()}.') 56 | 57 | 58 | def load_inputs(product: ConfigContainer, inputs: ConfigContainer, models: List[ConfigContainer]) -> List[DataSet]: 59 | input_datasets: list[DataSet] = list() 60 | 61 | def load_input(input_def): 62 | handle_input = data_src_handlers.get(input_def.type) 63 | if not handle_input: 64 | raise Exception(f"Input source handler [{input_def.type}] not registered.") 65 | return handle_input(input_def) 66 | 67 | for inp in inputs: 68 | model_id = inp.model if hasattr(inp, 'model') else None 69 | setattr(inp, 'type', resolve_io_type(inp)) 70 | 71 | # dataset_id is build as follows 72 | # file: 73 | # model: . 74 | # connection: . 
75 | data_product_id = resolve_data_product_id(inp) 76 | dataset_id = f'{data_product_id}.{resolve_data_set_id(inp)}' if data_product_id else resolve_data_set_id(inp) 77 | 78 | model_obj = filter_list_by_id(models, model_id) 79 | 80 | dp = DataProduct(id=product.id, description=getattr(product, 'description', None), 81 | owner=getattr(product, 'owner', None)) 82 | input_datasets.append(DataSet(dataset_id, load_input(inp), model_obj, dp)) 83 | return input_datasets 84 | 85 | 86 | def run_processors(phase: str, datasets: List[DataSet], processors: List[Callable]) -> List[DataSet]: 87 | try: 88 | processed_dfs: list[datasets] = datasets 89 | for processor in processors: 90 | logger.info(f'-> running processor: [{processor.__name__}]') 91 | new_dss: list[datasets] = list() 92 | for ds in processed_dfs: 93 | new_dss.append(processor(ds)) 94 | processed_dfs = new_dss 95 | return processed_dfs 96 | except ValidationException as vex: 97 | raise ProcessorChainExecutionException( 98 | f'{type(vex).__name__} in processor [{processor.__name__}] at processor chain: [{phase}]: {str(vex)}') from vex 99 | except Exception as e: 100 | raise ProcessorChainExecutionException( 101 | f'{type(e).__name__} in [{processor.__name__}] at processor chain: [{phase}]: {str(e)}') from e 102 | 103 | 104 | def transform(inp_dfs: List[DataSet], product_path: str, custom_module_name, params=None) -> List[DataSet]: 105 | from driver.driver import get_spark 106 | sys.path.append(product_path) 107 | logger.info(f'executing custom module: {custom_module_name}') 108 | custom_module = importlib.import_module(custom_module_name) 109 | sys.modules[custom_module_name] = custom_module 110 | 111 | spark = get_spark() 112 | if params: 113 | return custom_module.execute(inp_dfs, spark, **params) 114 | else: 115 | return custom_module.execute(inp_dfs, spark) 116 | 117 | 118 | def sink(o_dfs: List[DataSet]): 119 | for out_dataset in o_dfs: 120 | handle_output = output_handlers.get(out_dataset.storage_type) 121 | if not handle_output: 122 | raise Exception(f'Storage handler identified by {out_dataset.storage_type} is not found.') 123 | handle_output(out_dataset) 124 | 125 | 126 | def enrich(datasets: List[DataSet], product: ConfigContainer, models: List[ConfigContainer]): 127 | for dataset in datasets: 128 | if not dataset.product_id: 129 | dataset.product_id = product.id 130 | if not dataset.product_owner: 131 | dataset.product.owner = getattr(product, 'owner', None) 132 | if dataset.model is None: 133 | default_model = enrich_models(ConfigContainer(models=[ConfigContainer(id=dataset.id)]), product=product)[0] 134 | model_obj = next(iter([m for m in models if m.id == dataset.id]), default_model) 135 | dataset.model = model_obj 136 | return datasets 137 | 138 | 139 | def filter_output_models(task_outputs: List[ConfigContainer], models: List[ConfigContainer]): 140 | output_model_names = [to.model for to in task_outputs if hasattr(to, 'model')] 141 | return [model for model in models if model.id in output_model_names] 142 | 143 | 144 | def execute(product: ConfigContainer, task: ConfigContainer, models: List[ConfigContainer], product_path: str) \ 145 | -> List[DataSet]: 146 | logger.info(f'executing tasks > [{task.id}] for data product [{product.id}].') 147 | 148 | output_models = filter_output_models(task.outputs, models) 149 | input_dfs: list[DataSet] = run_processors('pre', load_inputs(product, task.inputs, models), pre_processors) 150 | input_dfs = enrich(input_dfs, product, output_models) 151 | 152 | task_logic_module = 
task.logic.module if hasattr(task, 'logic') and hasattr(task.logic, 153 | 'module') else 'builtin.ingest' 154 | task_logic_params = task.logic.parameters.__dict__ if hasattr(task, 'logic') and hasattr(task.logic, 155 | 'parameters') else {} 156 | output_dfs: list[DataSet] = transform(input_dfs, product_path, task_logic_module, task_logic_params) 157 | output_dfs = enrich(output_dfs, product, output_models) 158 | 159 | sink(run_processors('post', output_dfs, post_processors)) 160 | 161 | return output_dfs 162 | -------------------------------------------------------------------------------- /driver/util.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import functools 5 | from io import DEFAULT_BUFFER_SIZE 6 | import json 7 | import logging 8 | import os 9 | import yaml 10 | 11 | from typing import List, Any 12 | from jsonschema import validate, ValidationError, Draft3Validator 13 | from yaml.scanner import ScannerError 14 | from driver.core import ArtefactType, ConfigContainer 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def run_chain(input_payload, *callables: callable): 20 | functions = list() 21 | functions.extend(callables) 22 | result = input_payload 23 | for func in functions: 24 | func_name = func.func.__name__ if isinstance(func, functools.partial) else func.__name__ 25 | logger.info(f'chain > executing: {func_name}') 26 | try: 27 | result = func(result) 28 | except Exception as exc: 29 | logger.error(f'{type(exc).__name__} while executing <{func_name}> with error: {str(exc)}') 30 | raise 31 | return result 32 | 33 | 34 | def parse_dict_into_object(d: dict): 35 | x = ConfigContainer() 36 | for k, v in d.items(): 37 | if isinstance(v, dict): 38 | setattr(x, k, parse_dict_into_object(v)) 39 | elif isinstance(v, list): 40 | object_list = list() 41 | for e in v: 42 | object_list.append(parse_dict_into_object(e) if isinstance(e, dict) else e) 43 | setattr(x, k, object_list) 44 | else: 45 | setattr(x, str(k), v) 46 | return x 47 | 48 | 49 | def load_yaml(file_path: str): 50 | logger.info(f'loading file {file_path}') 51 | try: 52 | with open(fr'{file_path}') as file: 53 | return yaml.safe_load(file) 54 | except ScannerError as scerr: 55 | logger.error(f'Could not read [{file_path}] due to: {str(scerr)}') 56 | raise scerr 57 | 58 | 59 | def safe_get_property(object: Any, property: str): 60 | return getattr(object, property) if hasattr(object, property) else None 61 | 62 | 63 | def check_property(object, nested_property: str): 64 | """ 65 | :param object: the object to analyze 66 | :param nested_property: the nested properties separated by dots (.) (eg. 
model.storage.location) 67 | :return: True if the nested property can be found on the object; 68 | """ 69 | current_object = object 70 | for element in nested_property.split('.'): 71 | if hasattr(current_object, element): 72 | current_object = getattr(current_object, element) 73 | else: 74 | return False 75 | return True 76 | 77 | 78 | def filter_list_by_id(object_list, object_id): 79 | return next(iter([m for m in object_list if m.id == object_id]), None) 80 | 81 | 82 | def validate_schema(validable_dict: dict, artefact_type: ArtefactType): 83 | schema_vesion = validable_dict.get('schema_version') 84 | if not schema_vesion: 85 | raise ValidationError('schema_version keyword must be provided') 86 | script_folder = os.path.dirname(os.path.abspath(__file__)) 87 | schema_path = os.path.join(script_folder, 'schema', schema_vesion, f'{artefact_type.name}.json') 88 | with open(schema_path) as schema: 89 | schema = json.load(schema) 90 | try: 91 | validate(validable_dict, schema) 92 | except ValidationError as verr: 93 | for err in sorted(Draft3Validator(schema).iter_errors(validable_dict), key=str): 94 | logger.error(f'validation error detail: {err.message}') 95 | logger.error(f"{type(verr).__name__} while checking [{artefact_type.name}]: {str(verr)}") 96 | raise verr 97 | return validable_dict 98 | 99 | 100 | def enrich_product(product_input: ConfigContainer, args): 101 | # todo: replace this with a proper object merging logic 102 | product = product_input.product 103 | if not hasattr(product, 'defaults'): 104 | setattr(product, 'defaults', ConfigContainer()) 105 | if hasattr(args, 'default_data_lake_bucket') and not hasattr(product.defaults, 'storage'): 106 | storage = ConfigContainer() 107 | setattr(storage, 'location', args.default_data_lake_bucket) 108 | logger.debug(f'product defaults {product.defaults}') 109 | setattr(product.defaults, 'storage', storage) 110 | if not check_property(product, 'defaults.storage.location'): 111 | setattr(product.defaults.storage, 'location', args.default_data_lake_bucket) 112 | return product 113 | 114 | 115 | def enrich_models(models: ConfigContainer, product: ConfigContainer): 116 | def add_back_types(model, extended_model): 117 | columns_with_missing_type = [col for col in model.columns if not hasattr(col, 'type')] 118 | for col in columns_with_missing_type: 119 | setattr(col, 'type', filter_list_by_id(extended_model.columns, col.id).type) 120 | 121 | def decorate_model_with_defaults(model): 122 | if hasattr(product, 'defaults'): 123 | if not hasattr(model, 'storage') and hasattr(product.defaults, 'storage'): 124 | setattr(model, 'storage', product.defaults.storage) 125 | if not hasattr(model.storage, 'location') and hasattr(product.defaults.storage, 'location'): 126 | setattr(model.storage, 'location', product.defaults.storage.location) 127 | if not hasattr(model.storage, 'options') and hasattr(product.defaults.storage, 'options'): 128 | setattr(model.storage, 'options', product.defaults.storage.options) 129 | if not hasattr(model.storage, 'type'): 130 | setattr(model.storage, 'type', 'lake') 131 | if not hasattr(model.storage, 'format'): 132 | setattr(model.storage, 'format', 'parquet') 133 | return model 134 | 135 | compiled_models = list() 136 | for model in models.models: 137 | if hasattr(model, 'extends'): 138 | extended_model = filter_list_by_id(models.models, model.extends) 139 | if not extended_model: 140 | raise Exception( 141 | f'Cannot extend model {model.id} with {extended_model} because the root model is not found.') 142 | 
current_model_columns = set([col.id for col in model.columns]) 143 | extended_model_columns = set([col.id for col in extended_model.columns]) 144 | inherited_column_ids = extended_model_columns - current_model_columns 145 | inherited_columns = [filter_list_by_id(extended_model.columns, col_id) for col_id in inherited_column_ids] 146 | model.columns.extend(inherited_columns) 147 | add_back_types(model, extended_model) 148 | compiled_models.append(decorate_model_with_defaults(model)) 149 | return compiled_models 150 | 151 | 152 | def compile_product(product_path: str, args, prod_def_filename: str = 'product.yml'): 153 | part_enrich_product = functools.partial(enrich_product, args=args) 154 | part_validate_schema = functools.partial(validate_schema, artefact_type=ArtefactType.product) 155 | product_path = os.path.join(product_path, prod_def_filename) 156 | product_processing_chain = [load_yaml, part_validate_schema, parse_dict_into_object, part_enrich_product] 157 | return run_chain(product_path, *product_processing_chain) 158 | 159 | 160 | def compile_models(product_path: str, product: ConfigContainer, def_file_name: str = 'model.yml') -> List[ 161 | ConfigContainer]: 162 | model_path = os.path.join(product_path, def_file_name) 163 | part_validate_schema = functools.partial(validate_schema, artefact_type=ArtefactType.model) 164 | part_enrich_model = functools.partial(enrich_models, product=product) 165 | model_processing_chain = [load_yaml, part_validate_schema, parse_dict_into_object, part_enrich_model] 166 | return run_chain(model_path, *model_processing_chain) 167 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import configparser 5 | import importlib 6 | import logging 7 | import os 8 | import argparse 9 | import sys 10 | 11 | from pyspark import SparkConf 12 | import traceback 13 | import driver 14 | import driver.aws.providers 15 | from driver.aws.providers import connection_provider, datalake_provider 16 | from driver.driver import get_spark 17 | from driver.io_handlers import connection_input_handler, lake_input_handler, file_input_handler 18 | from driver.processors import schema_checker, constraint_processor, transformer_processor, type_caster, razor 19 | from driver.io_handlers import lake_output_handler, connection_input_handler 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def init_aws(args): 25 | profile = None 26 | region = None 27 | if hasattr(args, 'aws_profile'): 28 | profile = args.aws_profile 29 | if hasattr(args, 'aws_region'): 30 | region = args.aws_region 31 | driver.aws.providers.init(profile=profile, region=region) 32 | 33 | 34 | def build_spark_configuration(args, config: configparser.RawConfigParser, custom_hook: callable = None): 35 | conf = SparkConf() 36 | if hasattr(args, 'aws_profile'): 37 | logger.info(f'Setting aws profile: {args.aws_profile}') 38 | os.environ["AWS_PROFILE"] = args.aws_profile 39 | conf.set("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider") 40 | if hasattr(args, 'local') and args.local: 41 | """ local execution, dependencies should be configured """ 42 | deps_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'spark_deps') 43 | local_jars = [file for file in os.listdir(deps_path) if file.endswith('.jar')] 44 | if hasattr(args, 'jars'): 45 | local_jars.extend([f'{deps_path}/{j}' for j in args.jars.strip().split(',')]) 46 | jars = ','.join([os.path.join(deps_path, j) for j in local_jars]) 47 | conf.set("spark.jars", jars) 48 | if config: 49 | spark_jars = 'spark jars' 50 | if spark_jars in config.sections(): 51 | for k, v in config.items(spark_jars): 52 | conf.set(k, v) 53 | return custom_hook.enrich_spark_conf(conf) if custom_hook and hasattr(custom_hook, 'enrich_spark_conf') else conf 54 | 55 | 56 | def read_config(product_path: str) -> configparser.RawConfigParser: 57 | config_path = os.path.join(product_path, 'config.ini') 58 | if os.path.isfile(config_path): 59 | config = configparser.ConfigParser() 60 | config.read(config_path) 61 | return config 62 | else: 63 | return None 64 | 65 | 66 | def get_custom_hook(product_path: str) -> callable: 67 | hook_module_name = 'init_hook' 68 | hook_file = f'{hook_module_name}.py' 69 | hook_file_name = os.path.join(product_path, hook_file) 70 | if os.path.exists(hook_file_name): 71 | sys.path.append(product_path) 72 | logger.info(f'executing custom hooks: {hook_file_name}') 73 | module = importlib.import_module(hook_module_name) 74 | sys.modules[hook_module_name] = module 75 | return module 76 | else: 77 | return None 78 | 79 | 80 | def init_system(args): 81 | driver.io_handlers.init(connection_provider, datalake_provider) 82 | rel_product_path = os.path.join(args.product_path, '') if hasattr(args, 'product_path') else os.path.join('./', '') 83 | product_path = os.path.join(os.path.abspath(rel_product_path), '') 84 | config = read_config(product_path) 85 | custom_hook = get_custom_hook(product_path) 86 | driver.init(spark_config=build_spark_configuration(args, config, custom_hook)) 87 | logger.debug(f'using Spark configuration: {get_spark().sparkContext.getConf().getAll()}') 88 | 
logger.debug(f'the following jar packages are deployed: {get_spark().sparkContext._jsc.sc().listJars()}') 89 | driver.install_dependencies(product_path) 90 | driver.register_data_source_handler('connection', connection_input_handler) 91 | driver.register_data_source_handler('model', lake_input_handler) 92 | driver.register_data_source_handler('file', file_input_handler) 93 | driver.register_postprocessors(transformer_processor, razor, constraint_processor, type_caster, schema_checker) 94 | driver.register_output_handler('default', lake_output_handler) 95 | driver.register_output_handler('lake', lake_output_handler) 96 | if custom_hook: 97 | if hasattr(custom_hook, 'add_post_processors'): 98 | driver.register_postprocessors(*custom_hook.add_post_processors()) 99 | if hasattr(custom_hook, 'add_pre_processors'): 100 | driver.register_preprocessors(*custom_hook.add_pre_processors()) 101 | # if hasattr(custom_hook, 'add_transformers'): 102 | # driver.add_transformers(custom_hook.add_transformers()) 103 | # todo: the transformer dict is not used, the processor built-in transformers are the only ones looked up now 104 | driver.process_product(args, product_path) 105 | 106 | 107 | def main(): 108 | try: 109 | parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS) 110 | parser.add_argument('--JOB_ID', help='the unique id of this Glue job') 111 | parser.add_argument('--JOB_RUN_ID', help='the unique id of this Glue job run') 112 | parser.add_argument('--JOB_NAME', help='the name of this Glue job') 113 | parser.add_argument('--job-bookmark-option', help="job-bookmark-disable if you don't want bookmarking") 114 | parser.add_argument('--TempDir', help='temporary results directory') 115 | parser.add_argument('--product_path', help='the data product definition folder') 116 | parser.add_argument('--aws_profile', help='the AWS profile to be used for connection') 117 | parser.add_argument('--aws_region', help='the AWS region to be used') 118 | parser.add_argument('--local', action='store_true', help='local development') 119 | parser.add_argument('--jars', help='extra jars to be added to the Spark context') 120 | parser.add_argument('--additional-python-modules', help='this is used by Glue, ignored by this code') 121 | parser.add_argument('--default_data_lake_bucket', help='Data Mesh output S3 bucket name', default=None) 122 | parser.add_argument('--log_level', choices=['debug', 'info', 'warning'], help='Set the desired log level', default='info') 123 | args, unknown = parser.parse_known_args() 124 | logging.basicConfig() 125 | log_level = logging.getLevelName(args.log_level.upper()) 126 | logger.setLevel(log_level) 127 | logging.getLogger('driver').setLevel(log_level) 128 | logger.info(f"KNOWN_ARGS: {args}") 129 | logger.info(f"UNKNOWN_ARGS: {unknown}") 130 | logger.info(f'PATH: {os.environ["PATH"]}') 131 | logger.info(f'SPARK_HOME: {os.environ.get("SPARK_HOME")}') 132 | logger.info(f'PYTHONPATH: {os.environ.get("PYTHONPATH")}') 133 | 134 | init_aws(args) 135 | if hasattr(args, "JOB_NAME") and not (hasattr(args, 'local') and args.local): 136 | import zipfile 137 | 138 | with zipfile.ZipFile(f'{os.path.dirname(os.path.abspath(__file__))}/{args.JOB_NAME}.zip', 'r') as zip_ref: 139 | zip_ref.extractall(f'{os.path.dirname(os.path.abspath(__file__))}/') 140 | init_system(args=args) 141 | except Exception as e: 142 | logging.exception(e) 143 | traceback.print_exc() 144 | raise e 145 | 146 | 147 | if __name__ == '__main__': 148 | main() 149 | 
-------------------------------------------------------------------------------- /package.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | [pytest] 5 | addopts = -v --durations=0 6 | log_cli = 1 7 | log_cli_level = INFO 8 | #addopts = -v --durations=0 -s --log-cli-level 1 9 | testpaths = tests 10 | python_files = test_*.py *_test.py *_tests.py 11 | #env = PYTHONPATH=/glue-libs/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8/python:/glue-libs/aws-glue-libs/target/AWSGlueETLPython-1.0.0.jar:$PYTHONPATH 12 | markers = 13 | integration: test which require connection to real resources 14 | ignore: tests which should not run 15 | mock_use_standalone_module = false 16 | #nosecuredirs=tests/retired/ -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | -r requirements.txt 5 | 6 | pyspark 7 | pyspark-stubs 8 | pytest-spark 9 | pytest-mock 10 | pytest-helpers-namespace 11 | pytest-env 12 | pytest-cov 13 | pytest 14 | numpy 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | boto3==1.18.34 5 | botocore 6 | wheel==0.38.1 7 | pyyaml==5.4.1 8 | pydantic==1.10.10 9 | quinn 10 | boto3-stubs[glue]==1.18.34 11 | mypy-boto3-glue==1.18.34 12 | jsonschema==3.0.2 13 | pyspark==3.4.0 14 | numpy>=1.19.5 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.0.4 3 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?P
<pre>.*)
 4 | serialize = 
 5 | 	{major}.{minor}.{patch}{pre}
 6 | 	{major}.{minor}.{patch}
 7 | 
 8 | [bumpversion:file:setup.py]
 9 | 
10 | [bumpversion:file:version.sh]
11 | 
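# Illustrative note (not part of the original config): with this configuration, running
#   bump2version patch
# locally would bump 1.0.4 to 1.0.5 and rewrite the version string in setup.py and
# version.sh via the file sections listed above.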
12 | [bdist_wheel]
13 | universal = 0
14 | 
15 | [aliases]
16 | test = pytest
17 | 
18 | [metadata]
19 | description-file = README.md
20 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | from os import path, system  # 'system' is used by CleanCommand.run() below
 4 | from pip._internal.req import parse_requirements
 5 | from setuptools import setup, find_packages, Command
 6 | 
 7 | here = path.abspath(path.dirname(__file__))
 8 | 
 9 | with open(path.join(here, 'README.md'), encoding='utf-8') as f:
10 |     long_description = f.read()
11 | 
12 | requirements = [str(ir.requirement) for ir in parse_requirements(
13 |     'requirements.txt', session=False)]
14 | 
15 | 
16 | class CleanCommand(Command):
17 |     user_options = []
18 | 
19 |     def initialize_options(self):
20 |         pass
21 | 
22 |     def finalize_options(self):
23 |         pass
24 | 
25 |     def run(self):
26 |         system(
27 |             'rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info ./htmlcov '
28 |             './spark-warehouse ./driver/spark-warehouse ./metastore_db ./coverage_html ./.pytest_cache ./derby.log ./tests/local_results ./tasks/__pycache__')
29 | 
30 | 
31 | setup(
32 |     name="data-product-processor",
33 |     version="1.0.4",
34 |     description="The data product processor (dpp) is a library for dynamically creating and executing Apache Spark Jobs based on a declarative description of a data product.",
35 |     long_description=long_description,
36 |     long_description_content_type='text/markdown',
37 |     author="Amazon Web Services",
38 |     url='https://github.com/aws-samples/dpac-data-product-processor',
39 |     packages=find_packages(
40 |         exclude=(
41 |             "contrib",
42 |             "docs",
43 |             "tests",
44 |         )
45 |     ),
46 |     py_modules=[
47 |         'main',
48 |     ],
49 |     install_requires=requirements,
50 |     include_package_data=True,
51 |     platforms="any",
52 |     license="Apache License 2.0",
53 |     zip_safe=False,
54 |     cmdclass={
55 |         'clean_all': CleanCommand,
56 |         # 'package': Package
57 |     },
58 |     entry_points={
59 |         "console_scripts": ["data-product-processor=main:main"],
60 |     },
61 | )
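# Usage sketch (added note, not part of the original file): build a distributable wheel with
#   python setup.py bdist_wheel
# or remove local build artefacts via the custom command registered in cmdclass above:
#   python setup.py clean_all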
62 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/aws_glue_dc_connection.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Connection": {
 3 |     "ConnectionProperties": {
 4 |       "JDBC_CONNECTION_URL": "jdbc:postgresql://deng-pub-immersion-day.cofepwz7osto.eu-west-1.rds.amazonaws.com:5432/sportstickets",
 5 |       "JDBC_ENFORCE_SSL": "false",
 6 |       "PASSWORD": "some_pass",
 7 |       "USERNAME": "some_username"
 8 |     },
 9 |     "ConnectionType": "JDBC",
10 |     "CreationTime": "2021-08-11 15:44:08.285000+02:00",
11 |     "Description": "The connection to the test database",
12 |     "LastUpdatedTime": "2021-08-11 15:44:08.285000+02:00",
13 |     "Name": "test_db_connection",
14 |     "PhysicalConnectionRequirements": {
15 |       "AvailabilityZone": "eu-central-1a",
16 |       "SecurityGroupIdList": [
17 |         "sg-00d42f53e8f8b1963"
18 |       ],
19 |       "SubnetId": "subnet-0cbf5810a184046ce"
20 |     }
21 |   },
22 |   "ResponseMetadata": {
23 |     "HTTPHeaders": {
24 |       "connection": "keep-alive",
25 |       "content-length": "566",
26 |       "content-type": "application/x-amz-json-1.1",
27 |       "date": "Mon, 30 Aug 2021 12:36:13 GMT",
28 |       "x-amzn-requestid": "8f3e12d5-9626-4407-b217-62ce80b8c460"
29 |     },
30 |     "HTTPStatusCode": 200,
31 |     "RequestId": "8f3e12d5-9626-4407-b217-62ce80b8c460",
32 |     "RetryAttempts": 0
33 |   }
34 | }


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/glue_dc_get_db_rsp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Database": {
 3 |     "CatalogId": "588942721560",
 4 |     "CreateTableDefaultPermissions": [
 5 |       {
 6 |         "Permissions": [
 7 |           "ALL"
 8 |         ],
 9 |         "Principal": {
10 |           "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
11 |         }
12 |       }
13 |     ],
14 |     "CreateTime": "2021-09-03 12:27:22+02:00",
15 |     "Name": "customers"
16 |   },
17 |   "ResponseMetadata": {
18 |     "HTTPHeaders": {
19 |       "connection": "keep-alive",
20 |       "content-length": "246",
21 |       "content-type": "application/x-amz-json-1.1",
22 |       "date": "Fri, 03 Sep 2021 15:57:29 GMT",
23 |       "x-amzn-requestid": "7f953ee9-9221-4953-8a0e-4dc3917cdfe2"
24 |     },
25 |     "HTTPStatusCode": 200,
26 |     "RequestId": "7f953ee9-9221-4953-8a0e-4dc3917cdfe2",
27 |     "RetryAttempts": 0
28 |   }
29 | }


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/glue_dc_get_db_rsps.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "DatabaseList": [
 3 |     {
 4 |       "CatalogId": "588942721560",
 5 |       "CreateTableDefaultPermissions": [
 6 |         {
 7 |           "Permissions": [
 8 |             "ALL"
 9 |           ],
10 |           "Principal": {
11 |             "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
12 |           }
13 |         }
14 |       ],
15 |       "CreateTime": "2021-09-03 12:27:22+02:00",
16 |       "Name": "customers"
17 |     },
18 |     {
19 |       "CatalogId": "588942721560",
20 |       "CreateTableDefaultPermissions": [
21 |         {
22 |           "Permissions": [
23 |             "ALL"
24 |           ],
25 |           "Principal": {
26 |             "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
27 |           }
28 |         }
29 |       ],
30 |       "CreateTime": "2021-08-27 09:29:58+02:00",
31 |       "Name": "sportstickets"
32 |     },
33 |     {
34 |       "CatalogId": "588942721560",
35 |       "CreateTableDefaultPermissions": [
36 |         {
37 |           "Permissions": [
38 |             "ALL"
39 |           ],
40 |           "Principal": {
41 |             "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
42 |           }
43 |         }
44 |       ],
45 |       "CreateTime": "2021-08-11 15:43:31+02:00",
46 |       "Description": "The Main Glue Database for the data Mesh",
47 |       "Name": "test_db",
48 |       "Parameters": {}
49 |     }
50 |   ],
51 |   "ResponseMetadata": {
52 |     "HTTPHeaders": {
53 |       "connection": "keep-alive",
54 |       "content-length": "795",
55 |       "content-type": "application/x-amz-json-1.1",
56 |       "date": "Fri, 03 Sep 2021 15:59:23 GMT",
57 |       "x-amzn-requestid": "7e2ba307-a94f-49e9-8c64-c5c283eae938"
58 |     },
59 |     "HTTPStatusCode": 200,
60 |     "RequestId": "7e2ba307-a94f-49e9-8c64-c5c283eae938",
61 |     "RetryAttempts": 0
62 |   }
63 | }


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/glue_gc_get_table_rsp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "Table": {
 3 |         "Name": "person",
 4 |         "DatabaseName": "test_db",
 5 |         "Owner": "owner",
 6 |         "CreateTime": "2021-09-04 23:22:51+02:00",
 7 |         "UpdateTime": "2021-09-05 13:19:54+02:00",
 8 |         "LastAccessTime": "2021-09-05 13:19:54+02:00",
 9 |         "Retention": 0,
10 |         "StorageDescriptor": {
11 |             "Columns": [
12 |                 {
13 |                     "Name": "id",
14 |                     "Type": "double"
15 |                 },
16 |                 {
17 |                     "Name": "full_name",
18 |                     "Type": "string"
19 |                 }
20 |             ],
21 |             "Location": "s3://glue-job-test-destination-bucket/person/",
22 |             "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
23 |             "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
24 |             "Compressed": false,
25 |             "NumberOfBuckets": -1,
26 |             "SerdeInfo": {
27 |                 "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
28 |                 "Parameters": {
29 |                     "serialization.format": "1"
30 |                 }
31 |             },
32 |             "BucketColumns": [],
33 |             "SortColumns": [],
34 |             "Parameters": {
35 |                 "CrawlerSchemaDeserializerVersion": "1.0",
36 |                 "CrawlerSchemaSerializerVersion": "1.0",
37 |                 "UPDATED_BY_CRAWLER": "customer_person",
38 |                 "averageRecordSize": "89",
39 |                 "classification": "parquet",
40 |                 "compressionType": "none",
41 |                 "objectCount": "7",
42 |                 "recordCount": "100",
43 |                 "sizeKey": "12647",
44 |                 "typeOfData": "file"
45 |             },
46 |             "StoredAsSubDirectories": false
47 |         },
48 |         "PartitionKeys": [
49 |             {
50 |                 "Name": "gender",
51 |                 "Type": "string"
52 |             },
53 |             {
54 |                 "Name": "age",
55 |                 "Type": "string"
56 |             }
57 |         ],
58 |         "TableType": "EXTERNAL_TABLE",
59 |         "Parameters": {
60 |             "CrawlerSchemaDeserializerVersion": "1.0",
61 |             "CrawlerSchemaSerializerVersion": "1.0",
62 |             "UPDATED_BY_CRAWLER": "customer_person",
63 |             "averageRecordSize": "89",
64 |             "classification": "parquet",
65 |             "compressionType": "none",
66 |             "objectCount": "7",
67 |             "recordCount": "100",
68 |             "sizeKey": "12647",
69 |             "typeOfData": "file"
70 |         },
71 |         "CreatedBy": "arn:aws:sts::588942721560:assumed-role/AWSGlueServiceRole-crawler/AWS-Crawler",
72 |         "IsRegisteredWithLakeFormation": false,
73 |         "CatalogId": "588942721560"
74 |     },
75 |     "ResponseMetadata": {
76 |         "RequestId": "84d46abf-4cf0-4b08-91b4-42c855ffeac6",
77 |         "HTTPStatusCode": 200,
78 |         "HTTPHeaders": {
79 |             "date": "Sun, 05 Sep 2021 15:16:50 GMT",
80 |             "content-type": "application/x-amz-json-1.1",
81 |             "content-length": "1606",
82 |             "connection": "keep-alive",
83 |             "x-amzn-requestid": "84d46abf-4cf0-4b08-91b4-42c855ffeac6"
84 |         },
85 |         "RetryAttempts": 0
86 |     }
87 | }


--------------------------------------------------------------------------------
/tests/assets/integration/model.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | models:
 3 |   - id: person_pii
 4 |     version: "1.0.0"
 5 |     name: xxxxxxx
 6 |     description: A person, who can be a customer, including PII
 7 |     columns:
 8 |       - id: id
 9 |         type: integer
10 |         constraints:
11 |           - type: unique
12 |           - type: not_null
13 |       - id: first_name
14 |         type: string
15 |       - id: last_name
16 |         type: string
17 |       - id: full_name
18 |         type: string
19 |       - id: gender
20 |         type: string
21 |         constraints:
22 |           - type: not_null
23 |           - type: regexp
24 |             options:
25 |               value: '^male|female$'
26 |       - id: age
27 |         type: integer
28 |     meta:
29 |       contains_pii: true
30 |     storage:
31 |       type: lake
32 |       format: parquet
33 | #      location: 'glue-job-test-destination-bucket/person_pii'
34 |       options:
35 |         skip_first_row: true
36 |         partition_by:
37 |           - gender
38 |           - age
39 |         bucketed_at: 512M
40 |     tags:
41 |       cost_center: 123455
42 |       use_case: Customer 360
43 |     access:
44 |       domain: customer_support
45 |       confidentiality: private
46 | 
47 |   - id: person_pub
48 |     version: "1.0.0"
49 |     description: public personal data
50 |     extends: person_pii
51 |     columns:
52 |       - id: full_name
53 |         transform:
54 |           - type: encrypt
55 |       - id: first_name
56 |         transform:
57 |           - type: skip
58 |       - id: last_name
59 |         transform:
60 |           - type: skip
61 |       - id: age
62 |         type: string
63 |         transform:
64 |           - type: bucketize
65 |             options:
66 |               buckets:
67 |                 0: 0-19
68 |                 20: 20-39
69 |                 40: 40+
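          # illustrative note (not part of the original asset): the built-in bucketize transformer
          # turns these keys into the ascending splits [0.0, 20.0, 40.0, +Inf] and replaces the
          # numeric age value with the matching label, e.g. 27 -> "20-39".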
70 |     meta:
71 |       contains_pii: false
72 |     storage:
73 |       type: lake
74 | #      location: 'glue-job-test-destination-bucket/person_pub'
75 |       format: parquet
76 |       options:
77 |         skip_first_row: true
78 |         partition_by:
79 |           - gender
80 |           - age
81 |         bucketed_at: 512M
82 |     tags:
83 |       cost_center: 123455
84 |       use_case: Customer 360
85 |     access:
86 |       domain: customer_support
87 |       confidentiality: public
88 | 
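# Added note (not part of the original asset): because person_pub extends person_pii, the model
# compiler (driver.util.enrich_models) copies every column not redefined here (e.g. id, gender)
# from person_pii into person_pub and backfills missing column types from the parent definition.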


--------------------------------------------------------------------------------
/tests/assets/integration/product.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | product:
 3 |   id: product_a_customers
 4 |   version: "1.0.0"
 5 |   defaults:
 6 |       storage:
 7 |         location:
 8 |       engine: glue
 9 |   owner: jane.doe@acme.com
10 |   name: Customers
11 |   description: All customer data
12 |   pipeline:
13 |     schedule: "0 */1 * * *"
14 |     tasks:
15 |       - id: extract customer data
16 |         logic:
17 |           module: tasks.custom_business_logic
18 |           parameters:
19 |             create_timestamp: true
20 |         inputs:
21 |           - connection: test_db_connection
22 |             table: dms_sample.person_relevant
23 |         outputs:
24 |           - model: person_pii
25 |           - model: person_pub
26 | #      - id: save anonymized version of customers
27 | #        inputs:
28 | #          - model:
29 | #        outputs:
30 | #          - model: person_pub
31 | 
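# Added note (not part of the original asset): at run time the task executor loads the JDBC input
# through the registered 'connection' handler, passes the resulting DataSets to
# tasks.custom_business_logic.execute() together with the declared parameters, and writes each
# returned DataSet to the lake under its matching output model (person_pii, person_pub).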


--------------------------------------------------------------------------------
/tests/assets/integration/tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/assets/integration/tasks/__init__.py


--------------------------------------------------------------------------------
/tests/assets/integration/tasks/custom_business_logic.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import datetime
 3 | from typing import List
 4 | 
 5 | from pyspark.sql.functions import concat, col, lit, unix_timestamp
 6 | from driver.common import find_dataset_by_id
 7 | from driver.task_executor import DataSet
 8 | 
 9 | 
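# Added note (not part of the original asset): driver.task_executor.transform() imports this module
# via logic.module in product.yml and calls execute() with the loaded input DataSets, the
# SparkSession and any declared logic.parameters as keyword arguments; the DataSets returned here
# are matched to the task's output models (person_pub, person_pii) by their id before
# post-processing and storage.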
10 | def execute(inp_dfs: List[DataSet], spark=None, create_timestamp=False):  # 'spark' is passed by task_executor.transform()
11 |     ds = find_dataset_by_id(inp_dfs, 'person_relevant')
12 | 
13 |     if create_timestamp:
14 |         timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
15 |         ds.df = ds.df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
16 | 
17 |     df = ds.df.withColumn('full_name', concat(col('first_name'), lit(' '), col('last_name')))
18 | 
19 |     ds_pub = DataSet(id='person_pub', df=df)
20 |     ds_pii = DataSet(id='person_pii', df=df)
21 | 
22 | 
23 |     return [ds_pub, ds_pii]


--------------------------------------------------------------------------------
/tests/assets/integration_calendar/model.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | models:
 3 |   - id: calendar
 4 |     version: "1.0.0"
 5 |     name: some name
 6 |     description: Enriched sports calendar events
 7 |     columns:
 8 |       - id: id
 9 |         type: long
10 |         constraints:
11 |           - type: unique
12 |           - type: not_null
13 |       - id: start_date_time
14 |         type: timestamp
15 |       - id: sport_type_name
16 |         type: string
17 |       - id: away_team
18 |         type: string
19 |       - id: home_team
20 |         type: string
21 |       - id: location
22 |         type: string
23 | 
24 |     meta:
25 |       contains_pii: true
26 |       steward: jane.doe@acme.com
27 |     storage:
28 |       type: lake
29 |       location: '/glue-job-test-destination-bucket/calendar'
30 |       options:
31 |         skip_first_row: true
32 |         partition_by:
33 |           - sport_type_name
34 |         bucketed_at: 512M
35 |         stored_as: parquet
36 |     tags:
37 |       cost_center: 123455
38 |       use_case: Customer 360
39 |     access:
40 |       domain: customer_support
41 |       confidentiality: private
42 | 


--------------------------------------------------------------------------------
/tests/assets/integration_calendar/product.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | product:
 3 |   id: calendar
 4 |   version: "1.0.0"
 5 |   owner: jane.doe@acme.com
 6 |   name: Customers
 7 |   description: All customer data
 8 |   pipeline:
 9 |     schedule: "0 3 * * *"
10 |     tasks:
11 |       - id: aggregate_events
12 |         engine: glue
13 |         logic:
14 |           module: tasks.custom_aggregate_events
15 |         inputs:
16 |           - model: events
17 |           - model: teams
18 |           - model: locations
19 | 


--------------------------------------------------------------------------------
/tests/assets/integration_file/model.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | models:
 3 |   - id: industry
 4 |     version: "1.0"
 5 |     name: industry report file
 6 |     description: a random industry report file ingested from the internet
 7 |     columns:
 8 |       - id: year
 9 |         type: integer
10 |         constraints:
11 |           - type: not_null
12 |       - id: industry_code_ANZSIC
13 |         name: Industry Code
14 |         description: the code of the industry
15 |         type: string
16 |       - id: industry_name_ANZSIC
17 |         type: string
18 |       - id: rme_size_grp
19 |         type: string
20 |       - id: variable
21 |         type: string
22 |       - id: value
23 |         type: string
24 |       - id: unit
25 |         type: string
26 |     meta:
27 |       contains_pii: true
28 |       steward: jane.doe@acme.com
29 |     tags:
30 |       cost_center: 123455
31 |       use_case: Customer 360
32 |     access:
33 |       domain: customer_support
34 |       confidentiality: private


--------------------------------------------------------------------------------
/tests/assets/integration_file/product.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | product:
 3 |   id: some_data_product
 4 |   owner: jane@acme.com
 5 |   description: some description is required
 6 |   defaults:
 7 |     storage:
 8 |       options:
 9 |         compression: gzip
10 |   version: "1.0.0"
11 |   engine: glue
12 |   pipeline:
13 |     schedule: "0 3 * * *"
14 |     tasks:
15 |       - id: process_some_files
16 |         logic:
17 |           module: builtin.ingest
18 |           parameters:
19 |             create_timestamp: true
20 |         inputs:
21 |           - file: s3://datalakebucket-588942721560/csvs/annual-enterprise-survey-2020-financial-year-provisional-size-bands-csv.csv
22 |             options:
23 |               type: csv
24 |               infer_schema: true
25 |               separator: ','
26 |               header: true
27 |         outputs:
28 |           - model: industry
29 | 
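# Added note (not part of the original asset): 'file' inputs are served by the registered
# file_input_handler (see main.py), which reads the CSV with the options above; the single output
# is then validated against the 'industry' model by the post-processors and stored with the
# product's default storage options (gzip compression).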


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/assets/integration_sport_events/__init__.py


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/model.yml:
--------------------------------------------------------------------------------
  1 | models:
  2 |   - id: event
  3 |     version: "1.0.0"
  4 |     description: Sport event
  5 |     columns:
  6 |       - id: id
  7 |         type: long
  8 |         constraints:
  9 |           - type: unique
 10 |           - type: not_null
 11 |       - id: sport_type_name
 12 |         type: string
 13 |       - id: home_team_id
 14 |         type: integer
 15 |       - id: away_team_id
 16 |         type: integer
 17 |       - id: location_id
 18 |         type: short
 19 |       - id: start_date_time
 20 |         type: timestamp
 21 |       - id: start_date
 22 |         type: date
 23 |         transform:
 24 |           - type: skip
 25 |       - id: sold_out
 26 |         type: short
 27 |         transform:
 28 |           - type: skip
 29 | 
 30 |     meta:
 31 |       contains_pii: true
 32 |       steward: jane.doe@acme.com
 33 |     storage:
 34 |       type: lake
 35 |       options:
 36 |         skip_first_row: true
 37 |         partition_by:
 38 |           - sport_type_name
 39 |         bucketed_at: 512M
 40 |         stored_as: parquet
 41 |         location: 's3a://glue-job-test-destination-bucket/sport_event'
 42 |     tags:
 43 |       cost_center: 123455
 44 |       use_case: Customer 360
 45 |     access:
 46 |       domain: customer_support
 47 |       confidentiality: private
 48 | 
 49 |   - id: location
 50 |     version: "1.0.0"
 51 |     description: Sport Location
 52 |     columns:
 53 |       - id: id
 54 |         type: integer
 55 |         constraints:
 56 |           - type: unique
 57 |           - type: not_null
 58 |       - id: name
 59 |         type: string
 60 |       - id: city
 61 |         type: string
 62 |       - id: seating_capacity
 63 |         type: integer
 64 |       - id: levels
 65 |         type: integer
 66 |       - id: sections
 67 |         type: integer
 68 | 
 69 |     meta:
 70 |       contains_pii: true
 71 |       steward: jane.doe@acme.com
 72 |     storage:
 73 |       type: lake
 74 |       options:
 75 |         skip_first_row: true
 76 |         partition_by:
 77 |           - levels
 78 |         bucketed_at: 512M
 79 |         stored_as: parquet
 80 |         location: 's3a://glue-job-test-destination-bucket/location'
 81 |     tags:
 82 |       cost_center: 123455
 83 |       use_case: Customer 360
 84 |     access:
 85 |       domain: customer_support
 86 |       confidentiality: private
 87 | 
 88 |   - id: team
 89 |     version: "1.0.0"
 90 |     description: Sport Team
 91 |     columns:
 92 |       - id: id
 93 |         type: integer
 94 |         constraints:
 95 |           - type: unique
 96 |           - type: not_null
 97 |       - id: name
 98 |         type: string
 99 |       - id: abbreviated_name
100 |         type: string
101 |       - id: home_field_id
102 |         type: integer
103 |       - id: sport_type_name
104 |         type: string
105 |       - id: sport_league_short_name
106 |         type: string
107 |       - id: sport_division_short_name
108 |         type: string
109 | 
110 |     meta:
111 |       contains_pii: true
112 |       steward: jane.doe@acme.com
113 |     storage:
114 |       type: lake
115 |       options:
116 |         skip_first_row: true
117 |         partition_by:
118 |           - sport_league_short_name
119 |           - sport_division_short_name
120 |         bucketed_at: 512M
121 |         stored_as: parquet
122 |         location: 's3a://glue-job-test-destination-bucket/team'
123 |     tags:
124 |       cost_center: 123455
125 |       use_case: Customer 360
126 |     access:
127 |       domain: customer_support
128 |       confidentiality: private
129 | 


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/product.yml:
--------------------------------------------------------------------------------
 1 | product:
 2 |   id: sport_events
 3 |   version: "1.0.0"
 4 |   description: Sporting Events
 5 |   pipeline:
 6 |     schedule: "0 3 * * *"
 7 |     tasks:
 8 |       - id: extract_sport_events
 9 |         logic:
10 |           module: builtin.ingest
11 |         input:
12 |           - id: events
13 |             type: connection
14 |             connection_id: test_db_connection
15 |             table: dms_sample.sporting_event
16 |             model: event
17 |       - id: extract_locations
18 |         logic:
19 |           module: builtin.ingest
20 |         input:
21 |           - id: locations
22 |             type: connection
23 |             connection_id: test_db_connection
24 |             table: dms_sample.sport_location_int
25 |             model: location
26 |       - id: extract_teams
27 |         logic:
28 |           module: builtin.ingest
29 |         input:
30 |           - id: teams
31 |             type: connection
32 |             connection_id: test_db_connection
33 |             table: dms_sample.sport_team_int
34 |             model: team
35 | 
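
Note: each task above delegates to the built-in `builtin.ingest` module and reads from a Glue connection. The sketch below shows the registration pattern for running such a product locally with mocked handlers, mirroring tests/test_task_executor.py later in this listing; `spark_session`, `stub_frames` and the product path are assumptions of this sketch, not repo code.

import driver
from driver import ConfigContainer, DataSet

def mock_connection_handler(props: ConfigContainer):
    # stub_frames (assumed): a dict mapping table names such as 'dms_sample.sporting_event'
    # to pre-built Spark DataFrames shaped like the models in model.yml above
    return stub_frames[props.table]

def mock_output_handler(ds: DataSet):
    ds.df.show()

driver.init(spark_session)  # spark_session (assumed): an already created SparkSession
driver.register_data_source_handler('connection', mock_connection_handler)
driver.register_output_handler('default', mock_output_handler)
driver.register_output_handler('lake', mock_output_handler)

app_args = ConfigContainer()
setattr(app_args, 'product_path', 'tests/assets/integration_sport_events')  # path assumed
driver.process_product(app_args, 'tests/assets/integration_sport_events')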


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/assets/integration_sport_events/tasks/__init__.py


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/tasks/custom_aggregate_events.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from pyspark.sql.functions import col
 4 | 
 5 | from driver.common import find_dataset_by_id
 6 | from driver.core import DataSet
 7 | 
 8 | 
 9 | def execute(inp_dfs: List[DataSet]):
10 |     events = find_dataset_by_id(inp_dfs, 'events').df.alias('events')
11 |     teams = find_dataset_by_id(inp_dfs, 'teams').df.alias('teams')
12 |     locations = find_dataset_by_id(inp_dfs, 'locations').df.alias('locations')
13 | 
14 |     events = events.join(locations, on=events.location_id == locations.id).select('events.*', col("locations.name").alias('location'))
15 |     events = events.join(teams, on=events.home_team_id == teams.id).select('events.*', col('location'), col("teams.name").alias('home_team'))
16 |     events = events.join(teams, on=events.away_team_id == teams.id).select('events.*', col('location'), col('home_team'), col("teams.name").alias('away_team'))
17 | 
18 |     events = events.drop(col('location_id'))
19 |     events = events.drop(col('home_team_id'))
20 |     events = events.drop(col('away_team_id'))
21 | 
22 |     output_ds = DataSet(id='calendar', df=events)
23 |     return [output_ds]
24 | 
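
Note: the task module above joins the three ingested datasets and emits a single `calendar` dataset. A hedged smoke test (not part of the repo) is sketched below; it assumes the module is importable on the current sys.path as `custom_aggregate_events` and that the driver package is installed.

from pyspark.sql import SparkSession
from driver.core import DataSet
from custom_aggregate_events import execute  # import path is an assumption; adjust to your layout

spark = SparkSession.builder.master('local[1]').getOrCreate()
events = spark.createDataFrame([(1, 'Baseball', 10, 11, 5)],
                               'id long, sport_type_name string, home_team_id int, away_team_id int, location_id int')
teams = spark.createDataFrame([(10, 'Tigers'), (11, 'Lions')], 'id int, name string')
locations = spark.createDataFrame([(5, 'City Arena')], 'id int, name string')

calendar, = execute([DataSet(id='events', df=events),
                     DataSet(id='teams', df=teams),
                     DataSet(id='locations', df=locations)])
calendar.df.show()  # one row with location, home_team and away_team resolved, id columns dropped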


--------------------------------------------------------------------------------
/tests/assets/metafiles/model.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: person
 7 |     version: "1.0"
 8 |     columns:
 9 |       - id: id
10 |         type: integer
11 |         constraints:
12 |           - type: unique
13 |           - type: not_null
14 |       - id: first_name
15 |         type: string
16 |       - id: last_name
17 |         type: string
18 |       - id: age
19 |         type: integer
20 |       - id: city
21 |         type: string
22 |       - id: gender
23 |         type: string
24 |     meta:
25 |       contains_pii: true
26 |       steward: jane.doe@acme.com
27 |     tags:
28 |       cost_center: 123455
29 |       use_case: Customer 360
30 |     access:
31 |       domain: customer_support
32 |       confidentiality: private
33 |   - id: transaction
34 |     version: "1.0"
35 |     columns:
36 |       - id: id
37 |         type: integer
38 |         constraints:
39 |           - type: unique
40 |           - type: not_null
41 |       - id: sku
42 |         type: string
43 |       - id: trx_date
44 |         type: timestamp
45 |       - id: geo
46 |         type: string
47 |       - id: items
48 |         type: integer
49 |     meta:
50 |       contains_pii: false
51 |       steward: jane.doe@acme.com
52 |     tags:
53 |       cost_center: 123455
54 |       use_case: Customer 360
55 |     access:
56 |       domain: customer_support
57 |       confidentiality: private
58 |   - id: ratings
59 |     version: "1.0"
60 |     columns:
61 |       - id: userId
62 |         type: integer
63 |       - id: movieId
64 |         type: integer
65 |       - id: rating
66 |         type: integer
67 |       - id: timestamp
68 |         type: long
69 |     meta:
70 |       contains_pii: false
71 |       steward: jane.doe@acme.com
72 |     tags:
73 |       cost_center: 123455
74 |       use_case: Customer 360
75 |     access:
76 |       domain: customer_support
77 |       confidentiality: private
78 |   - id: movie
79 |     version: "1.0"
80 |     columns:
81 |       - id: movieId
82 |         type: integer
83 |       - id: title
84 |         type: string
85 |       - id: genres
86 |         type: string
87 |     meta:
88 |       contains_pii: false
89 |       steward: jane.doe@acme.com
90 |     tags:
91 |       cost_center: 123455
92 |       use_case: Customer 360
93 |     access:
94 |       domain: customer_support
95 |       confidentiality: private
96 | 
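
Note: model definitions like the one above are validated against the JSON schema bundled under driver/schema/1.rc-1. A minimal sketch, mirroring tests/test_model_compilation.py later in this listing (the working directory and asset path are assumptions):

import os
from driver import util
from driver.core import ArtefactType

asset_path = os.path.join('tests', 'assets', 'metafiles')           # assumes the repo root as cwd
model_def = util.load_yaml(os.path.join(asset_path, 'model.yml'))
util.validate_schema(model_def, ArtefactType.model)                  # raises jsonschema.ValidationError when invalid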


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_compilation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: person_pii
 7 |     version: "1.0.0"
 8 |     name: xxxxxxx
 9 |     description: A person, who can be a customer, including PII
10 |     columns:
11 |       - id: id
12 |         type: integer
13 |         constraints:
14 |           - type: unique
15 |           - type: not_null
16 |       - id: first_name
17 |         type: string
18 |       - id: last_name
19 |         type: string
20 |       - id: full_name
21 |         type: string
22 |       - id: gender
23 |         type: string
24 |         constraints:
25 |           - type: not_null
26 |           - type: regexp
27 |             options:
 28 |               value: '(Male|Female)'
29 |       - id: age
30 |         type: integer
31 |     meta:
32 |       contains_pii: true
33 |     storage:
34 |       type: lake
35 |       format: parquet
36 |       options:
37 |         skip_first_row: true
38 |         partition_by:
39 |           - gender
40 |           - age
41 |         bucketed_at: 512M
42 |     tags:
43 |       cost_center: 123455
44 |       use_case: Customer 360
45 |     access:
46 |       domain: customer_support
47 |       confidentiality: private
48 |   - id: person_pub
49 |     version: "1.0.0"
50 |     extends: person_pii
 51 |     description: a dataset with anonymised and pseudonymised columns
52 |     columns:
53 |       - id: full_name
54 |         transform:
55 |           - type: encrypt
56 |       - id: first_name
57 |         transform:
58 |           - type: skip
59 |       - id: last_name
60 |         transform:
61 |           - type: skip
62 |       - id: age
63 |         type: string
64 |         transform:
65 |           - type: bucketize
66 |             options:
67 |               buckets:
68 |                 0: 0-19
69 |                 20: 20-39
70 |                 40: 40+
71 |     meta:
72 |       contains_pii: false
73 |     storage:
74 |       type: lake
75 |       location: 'glue-job-test-destination-bucket/person_pub'
76 |       options:
77 |         skip_first_row: true
78 |         partition_by:
79 |           - gender
80 |           - age
81 |         bucketed_at: 512M
82 |         stored_as: parquet
83 |     tags:
84 |       cost_center: 123455
85 |       use_case: Customer 360
86 |     access:
87 |       domain: customer_support
88 |       confidentiality: public
89 | 
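
Note: `person_pub` above uses `extends: person_pii`, so columns, types and transforms are merged from the parent model at compilation time, and models without an explicit storage location fall back to the default data lake bucket. A hedged sketch of how these fixtures are compiled, following tests/test_model_compilation.py (the path and bucket value are assumptions):

from driver import ConfigContainer
from driver.util import compile_product, compile_models, filter_list_by_id

args = ConfigContainer()
setattr(args, 'default_data_lake_bucket', 's3://test-bucket')        # assumed default bucket
product = compile_product('tests/assets/metafiles', args, prod_def_filename='product_compilation.yml')
models = compile_models('tests/assets/metafiles', product, def_file_name='model_compilation.yml')

person_pii = filter_list_by_id(models, 'person_pii')
print(person_pii.storage.location)                         # falls back to the default bucket (no explicit location above)
person_pub = filter_list_by_id(models, 'person_pub')
print(filter_list_by_id(person_pub.columns, 'id').type)    # 'integer', inherited from person_pii via extends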


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_correct.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: person_pii
 7 |     version: "1.0.0"
 8 |     name: xxxxxxx
 9 |     description: straightforward model configuration
10 |     columns:
11 |       - id: id
12 |         type: integer
13 |         constraints:
14 |           - type: unique
15 |           - type: not_null
16 |       - id: first_name
17 |         type: string
18 |       - id: last_name
19 |         type: string
20 |       - id: full_name
21 |         type: string
22 |       - id: gender
23 |         type: string
24 |         constraints:
25 |           - type: not_null
26 |           - type: regexp
27 |             options:
 28 |               value: '(Male|Female)'
29 |       - id: age
30 |         type: integer
31 |     meta:
32 |       contains_pii: true
33 |     storage:
34 |       type: lake
35 |       location: 'glue-job-test-destination-bucket/person_pii'
36 |       format: parquet
37 |       options:
38 |         skip_first_row: true
39 |         partition_by:
40 |           - gender
41 |           - age
42 |         bucketed_at: 512M
43 |     tags:
44 |       cost_center: 123455
45 |       use_case: Customer 360
46 |     access:
47 |       domain: customer_support
48 |       confidentiality: private
49 | 
50 |   - id: person_pub
51 |     version: "1.0.0"
52 |     extends: person_pii
 53 |     description: a dataset with anonymised and pseudonymised columns
54 |     columns:
55 |       - id: full_name
56 |         transform:
57 |           - type: encrypt
58 |       - id: first_name
59 |         transform:
60 |           - type: skip
61 |       - id: last_name
62 |         transform:
63 |           - type: skip
64 |       - id: age
65 |         type: string
66 |         transform:
67 |           - type: bucketize
68 |             options:
69 |               buckets:
70 |                 0: 0-19
71 |                 20: 20-39
72 |                 40: 40+
73 |     meta:
74 |       contains_pii: false
75 |     storage:
76 |       type: lake
77 |       location: 'glue-job-test-destination-bucket/person_pub'
78 |       options:
79 |         skip_first_row: true
80 |         partition_by:
81 |           - gender
82 |           - age
83 |         bucketed_at: 512M
84 |         stored_as: parquet
85 |     tags:
86 |       cost_center: 123455
87 |       use_case: Customer 360
88 |     access:
89 |       domain: customer_support
90 |       confidentiality: public
91 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_remove_xtra_columns.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: movie
 7 |     version: "1.0"
 8 |     xtra_columns: raze
 9 |     validation: strict
10 |     columns:
11 |       - id: movieId
12 |         type: integer
13 |       - id: title
14 |         type: string
15 |       - id: genres
16 |         type: string
17 |     meta:
18 |       contains_pii: false
19 |       steward: jane.doe@acme.com
20 |     tags:
21 |       cost_center: 123455
22 |       use_case: Customer 360
23 |     access:
24 |       domain: customer_support
25 |       confidentiality: private
26 | 
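
Note: `xtra_columns: raze` together with `validation: strict` means undeclared columns are removed by the razor processor so that strict schema validation can still pass. A minimal sketch based on tests/test_df_schema_validator.py later in this listing; `spark_session`, the compiled `movie_model` and the DataProduct values are assumptions of this sketch.

from driver.core import DataSet, DataProduct
from driver.processors import razor, schema_checker

# movie_model (assumed): the compiled 'movie' model from this file; spark_session (assumed): a SparkSession
df = spark_session.createDataFrame([(1, 'Heat (1995)', 'Action|Crime|Thriller', 'noise')],
                                   'movieId int, title string, genres string, unexpected string')
ds = DataSet(id='movie', df=df, model=movie_model,
             product=DataProduct(id='demo', description='demo product', owner='jane@acme.com'))
razor(ds)                 # drops the undeclared 'unexpected' column
ds = schema_checker(ds)   # passes despite validation: strict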


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_strict_validation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: movie
 7 |     version: "1.0"
 8 |     validation: strict
 9 |     columns:
10 |       - id: movieId
11 |         type: integer
12 |       - id: title
13 |         type: string
14 |       - id: genres
15 |         type: string
16 |     meta:
17 |       contains_pii: false
18 |       steward: jane.doe@acme.com
19 |     tags:
20 |       cost_center: 123455
21 |       use_case: Customer 360
22 |     access:
23 |       domain: customer_support
24 |       confidentiality: private
25 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: some_data_product
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: glue
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: process_some_files
15 |         inputs:
16 |           - connection: test_connection
17 |             table: some_schema.some_table
18 |             model: transaction
19 |         outputs:
20 |           - model: transaction
21 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_compilation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: fixture
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: glue
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: process fixtures
15 |         inputs:
16 |           - model: data_product_a.person
17 |           - model: data_product_b.movies
18 |           - model: ratings
19 |         outputs:
20 |           - model: transaction
21 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - connection: test_db_connection
20 |             table: persons
21 |             model: person
22 |         outputs:
23 |           - model: a
24 |           - model: b
25 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct_all_models.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - model: a
20 |         outputs:
21 |           - model: b
22 |           - model: c
23 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct_connection_w_model.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: default_extract
14 |         inputs:
15 |           - connection: connection_name
16 |             table: db_schema.db_table_name
17 |             model: person_pub
18 |         outputs:
19 |           - model: person_pub


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct_missing_logic_params.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - model: a
20 |         outputs:
21 |           - model: b
22 |           - model: c
23 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_input_file.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: some_data_product
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: glue
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: process_some_files
15 |         inputs:
16 |           - file: s3://datalakebucke/some_folder/some_file
17 |           - model: person
18 |         outputs:
19 |           - model: person
20 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_missing_logic.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         inputs:
15 |           - connection: test_db_connection
16 |             table: persons
17 |             model: person
18 |         outputs:
19 |           - model: a
20 |           - model: b
21 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_wrong_engine.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: error
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: extract_customers
15 |         logic:
16 |           module: tasks.custom_business_logic
17 |           parameters:
18 |             create_timestamp: true
19 |         inputs:
20 |           - connection: test_db_connection
21 |             table: persons
22 |             model: person
23 |         outputs:
24 |           - model: a
25 |           - model: b
26 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_wrong_output.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - connection: test_db_connection
20 |             table: persons
21 |             model: person
22 |         outputs:
23 |           - model: person
24 |           - error: b
25 | 


--------------------------------------------------------------------------------
/tests/aws/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0


--------------------------------------------------------------------------------
/tests/aws/test_datalake.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import json
 5 | from driver.aws.datalake_api import Partition
 6 | from driver.aws.resolvers import reshuffle_partitions
 7 | 
 8 | 
 9 | def test_partitions():
10 |     ps = ['gender=Female/age=20-39', 'gender=Male/age=0-19', 'gender=Female/age=40+', 'gender=Female/age=0-19',
11 |           'gender=Male/age=20-39', 'gender=Male/age=40+']
12 |     partitions = list()
13 |     for p in ps:
14 |         po = Partition(p)
15 |         partitions.append(po)
16 |     for print_p in partitions:
17 |         print(str(print_p))
18 |     part_dict = reshuffle_partitions(prefix='s3a://glue-job-test-destination-bucket/', partitions=partitions)
19 |     print(json.dumps(part_dict, indent=4))
20 | 
21 |     assert len(partitions) > 0


--------------------------------------------------------------------------------
/tests/catalog/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/catalog/__init__.py


--------------------------------------------------------------------------------
/tests/catalog/test_catalog.py:
--------------------------------------------------------------------------------
 1 | from boto3.session import Session
 2 | from pyspark.sql import DataFrame
 3 | from pytest import fixture
 4 | from pyspark.sql.types import (
 5 |     StringType,
 6 |     StructField,
 7 |     StructType,
 8 |     IntegerType,
 9 | )
10 | from driver.task_executor import DataSet
11 | from driver import ConfigContainer
12 | from unittest import skip
13 | 
14 | 
15 | @fixture
16 | def person_frame(spark_session) -> DataFrame:
17 |     return spark_session.createDataFrame(
18 |         [
19 |             (1, "Joe", "Average", 22),
20 |             (2, "Max", "Mustermann", 45),
21 |         ],
22 |         StructType(
23 |             [
24 |                 StructField("id", IntegerType(), True),
25 |                 StructField("first_name", StringType(), True),
26 |                 StructField("last_name", StringType(), True),
27 |                 StructField("age", IntegerType(), True)
28 |             ]
29 |         ),
30 |     )
31 | 
32 | 
33 | @skip("Integration test is skipped for now")
34 | def test_update(person_frame: DataFrame):
35 |     catalog_service = CatalogService(Session(profile_name='finn'))  # NOTE: CatalogService is not imported in this module; the import appears to be missing
36 | 
37 |     catalog_service.drain_database('customers')
38 | 
39 |     catalog_service.update_database('customers', 'person', DataSet(
40 |         id='person',
41 |         df=person_frame,
42 |         product_id='customers',
43 |         model_id='person',
44 |         model=ConfigContainer(
45 |             storage=ConfigContainer(
46 |                 options=ConfigContainer(
47 |                     location='s3://job-interpreter/data/customers'
48 |                 )
49 |             )
50 |         )
51 |     ))
52 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | import datetime
  5 | import os
  6 | from driver import ConfigContainer
  7 | 
  8 | from pyspark.sql import DataFrame
  9 | from pytest import fixture
 10 | from pyspark.sql.types import (
 11 |     StringType,
 12 |     StructField,
 13 |     StructType,
 14 |     IntegerType,
 15 |     LongType,
 16 |     DoubleType, TimestampType
 17 | )
 18 | 
 19 | from driver.util import compile_product, compile_models
 20 | 
 21 | DEFAULT_BUCKET = 's3://test-bucket'
 22 | 
 23 | 
 24 | @fixture(scope='module')
 25 | def fixture_asset_path():
 26 |     cwd_path = os.path.dirname(os.path.abspath(__file__))
 27 |     return os.path.join(cwd_path, 'assets', 'metafiles')
 28 | 
 29 | 
 30 | @fixture(scope='module')
 31 | def app_args() -> ConfigContainer:
 32 |     args = ConfigContainer()
 33 |     setattr(args, 'default_data_lake_bucket', DEFAULT_BUCKET)
 34 |     return args
 35 | 
 36 | @fixture(scope='module')
 37 | def movie_schema() -> StructType:
 38 |     return StructType([
 39 |         StructField('movieId', IntegerType(), True),
 40 |         StructField('title', StringType(), True),
 41 |         StructField('genres', StringType(), True)
 42 |     ])
 43 | 
 44 | 
 45 | @fixture(scope='module')
 46 | def ratings_schema() -> StructType:
 47 |     return StructType([
 48 |         StructField('userId', IntegerType(), True),
 49 |         StructField('movieId', IntegerType(), True),
 50 |         StructField('rating', IntegerType(), True),
 51 |         StructField('timestamp', LongType(), True)
 52 |     ])
 53 | 
 54 | 
 55 | @fixture(scope='module')
 56 | def result_schema() -> StructType:
 57 |     return StructType([
 58 |         StructField('title', StringType(), True),
 59 |         StructField('weight_avg', DoubleType(), True),
 60 |         StructField('num_votes', IntegerType(), True)
 61 |     ])
 62 | 
 63 | 
 64 | @fixture(scope='module')
 65 | def movies_df(spark_session, movie_schema) -> DataFrame:
 66 |     return spark_session.createDataFrame([(1, 'Jumanji(1995)', 'Adventure | Children | Fantasy'),
 67 |                                           (2, 'Heat (1995)', 'Action|Crime|Thriller')],
 68 |                                          movie_schema)
 69 | 
 70 | 
 71 | @fixture(scope='module')
 72 | def ratings_df(spark_session, ratings_schema) -> DataFrame:
 73 |     return spark_session.createDataFrame([(1, 1, 4, 1256677221),
 74 |                                           (2, 1, 4, 1256677222),
 75 |                                           (3, 1, 1, 1256677222),
 76 |                                           (4, 2, 4, 1256677222)
 77 |                                           ], ratings_schema)
 78 | 
 79 | 
 80 | @fixture(scope='module')
 81 | def person_schema() -> StructType:
 82 |     return StructType([
 83 |         StructField('id', IntegerType(), False),
 84 |         StructField('first_name', StringType(), True),
 85 |         StructField('last_name', StringType(), True),
 86 |         StructField('age', IntegerType(), True),
 87 |         StructField('city', StringType(), True),
 88 |         StructField('gender', StringType(), True),
 89 |     ])
 90 | 
 91 | 
 92 | @fixture(scope='module')
 93 | def person_df(spark_session, person_schema) -> DataFrame:
 94 |     return spark_session.createDataFrame([(1, "John", "Doe", 25, "Berlin", "Male"),
 95 |                                           (2, "Jane", "Doe", 41, "Berlin", "Female"),
 96 |                                           (3, "Maxx", "Mustermann", 30, "Berlin", "Male")
 97 |                                           ], person_schema)
 98 | 
 99 | 
100 | @fixture(scope='module')
101 | def transaction_schema() -> StructType:
102 |     return StructType([
103 |         StructField('id', IntegerType(), False),
104 |         StructField('sku', StringType(), True),
105 |         StructField('trx_date', TimestampType(), True),
106 |         StructField('geo', StringType(), True),
107 |         StructField('items', IntegerType(), True)
108 |     ])
109 | 
110 | 
111 | @fixture(scope='module')
112 | def transaction_df(spark_session, transaction_schema) -> DataFrame:
113 |     date_field = datetime.datetime.now()
114 |     return spark_session.createDataFrame([(1, "1234", date_field, "EMEA", 25),
115 |                                           (2, "1235", date_field, "EMEA", 41),
116 |                                           (3, "1236", date_field, "US", 30)
117 |                                           ], transaction_schema)
118 | 
119 | 
120 | @fixture(scope='module')
121 | def product(app_args, fixture_asset_path):
122 |     return compile_product(fixture_asset_path, app_args)
123 | 
124 | 
125 | @fixture(scope='module')
126 | def models(app_args, fixture_asset_path, product):
127 |     return compile_models(fixture_asset_path, product)
128 | 


--------------------------------------------------------------------------------
/tests/test_constraint_checkers.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import datetime
 5 | from driver import ConfigContainer
 6 | 
 7 | import pytest
 8 | from pyspark.sql.functions import lit, col
 9 | 
10 | from driver.core import ValidationException
11 | from driver.processors import past_validator, future_validator, unique_validator, regexp_validator, null_validator, \
12 |     freshness_validator
13 | 
14 | 
15 | def test_past_validator(spark_session, transaction_df):
16 |     past_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5, time_unit='hours'))
17 |     past_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5))
18 |     past_validator(transaction_df, 'trx_date', ConfigContainer())
19 |     past_validator(transaction_df, 'trx_date')
20 |     with pytest.raises(ValidationException) as vex:
21 |         updf = transaction_df.withColumn('trx_date', lit(datetime.datetime.now() + datetime.timedelta(days=5)))
22 |         updf.show()
23 |         past_validator(updf, 'trx_date', ConfigContainer(threshold=5, time_unit='hours'))
24 | 
25 | 
26 | def test_future_validator(spark_session, transaction_df):
27 |     transaction_df.show()
28 |     future_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5, time_unit='hours'))
29 |     future_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5))
30 |     with pytest.raises(ValidationException):
31 |         future_validator(transaction_df, 'trx_date', ConfigContainer())
32 |     with pytest.raises(ValidationException):
33 |         future_validator(transaction_df, 'trx_date')
34 |     updf = transaction_df.withColumn('trx_date', lit(datetime.datetime.now() + datetime.timedelta(days=5)))
35 |     updf.show()
36 |     future_validator(updf, 'trx_date')
37 | 
38 | 
39 | def test_unique_validator(spark_session, transaction_df, transaction_schema):
40 |     unique_validator(transaction_df, 'sku')
41 |     with pytest.raises(ValidationException):
42 |         new_row = spark_session.createDataFrame([(4, "1236", datetime.datetime.now(), "US", 30)], transaction_schema)
43 |         appended = transaction_df.union(new_row)
44 |         appended.show()
45 |         unique_validator(appended, 'sku')
46 | 
47 | 
48 | def test_regexp_validator(spark_session, transaction_df, transaction_schema):
49 |     regexp_validator(transaction_df, 'geo', ConfigContainer(value='^EMEA|US$'))
50 |     with pytest.raises(ValidationException):
51 |         new_row = spark_session.createDataFrame([(4, "1237", datetime.datetime.now(), "APJ", 30)], transaction_schema)
52 |         appended = transaction_df.union(new_row)
53 |         regexp_validator(appended, 'geo', ConfigContainer(value='^EMEA|US$'))
54 | 
55 | 
56 | def test_null_validator(spark_session, transaction_df, transaction_schema):
57 |     null_validator(transaction_df, 'geo')
58 |     with pytest.raises(ValidationException):
59 |         new_row = spark_session.createDataFrame([(4, "1237", datetime.datetime.now(), None, 30)], transaction_schema)
60 |         appended = transaction_df.union(new_row)
61 |         null_validator(appended, 'geo')
62 | 
63 | 
64 | def test_freshness_validator(spark_session, transaction_df, transaction_schema):
65 |     freshness_validator(transaction_df, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes'))
66 |     with pytest.raises(ValidationException):
67 |         trx_date = datetime.datetime.now() - datetime.timedelta(minutes=10)
68 |         upd_df = transaction_df.withColumn("trx_date", lit(trx_date))
69 |         freshness_validator(upd_df, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes'))
70 | 
71 |     freshness_validator(transaction_df, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes', group_by='geo'))
72 | 
73 |     with pytest.raises(ValidationException):
74 |         trx_date = datetime.datetime.now() - datetime.timedelta(minutes=10)
75 |         new_row = spark_session.createDataFrame([(4, "1237", trx_date, "APJ", 30)], transaction_schema)
76 |         appended = transaction_df.union(new_row)
77 |         freshness_validator(appended, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes', group_by='geo'))
78 | 


--------------------------------------------------------------------------------
/tests/test_core.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | def test_resolve_data_set_id():
5 |     pass
6 | 


--------------------------------------------------------------------------------
/tests/test_df_schema_validator.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import pytest
 5 | from time import time
 6 | from datetime import datetime
 7 | from pyspark.sql import DataFrame
 8 | from pyspark.sql.functions import col, unix_timestamp, lit
 9 | from driver.core import DataSet, DataProduct, SchemaValidationException
10 | from driver.processors import schema_checker, razor
11 | from driver.util import compile_models, filter_list_by_id
12 | 
13 | 
14 | def test_df_schema_validator(movies_df: DataFrame, product, models):
15 |     movie_model = filter_list_by_id(models, 'movie')
16 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
17 |     ds = DataSet(id='movie', df=movies_df, model=movie_model, product=dp)
18 |     ds = schema_checker(ds)
19 | 
20 | 
21 | def test_df_schema_validator_missing_fields(movies_df: DataFrame, product, models):
22 |     movie_model = filter_list_by_id(models, 'movie')
23 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
24 |     ds = DataSet(id='movie', df=movies_df.drop(col('genres')), model=movie_model, product=dp)
25 |     with pytest.raises(SchemaValidationException):
26 |         ds = schema_checker(ds)
27 | 
28 | 
29 | def test_df_schema_validator_extra_fields_lazy(movies_df: DataFrame, product, models):
30 |     movie_model = filter_list_by_id(models, 'movie')
31 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
32 |     timestamp = datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
33 |     df = movies_df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
34 |     ds = DataSet(id='movie', df=df, model=movie_model, product=dp)
35 |     ds = schema_checker(ds)
36 |     df.show()
37 | 
38 | 
39 | def test_df_schema_validator_extra_fields_strict(movies_df: DataFrame, product, fixture_asset_path):
40 |     models = compile_models(fixture_asset_path, product, def_file_name='model_strict_validation.yml')
41 |     movie_model = filter_list_by_id(models, 'movie')
42 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
43 |     timestamp = datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
44 |     df = movies_df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
45 |     ds = DataSet(id='movie', df=df, model=movie_model, product=dp)
46 |     df.show()
47 |     with pytest.raises(SchemaValidationException) as exc:
48 |         ds = schema_checker(ds)
49 | 
50 | 
51 | def test_df_schema_validator_extra_fields_strict_with_razor(movies_df: DataFrame, product, fixture_asset_path):
52 |     models = compile_models(fixture_asset_path, product, def_file_name='model_remove_xtra_columns.yml')
53 |     movie_model = filter_list_by_id(models, 'movie')
54 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
55 |     timestamp = datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
56 |     df = movies_df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
57 |     ds = DataSet(id='movie', df=df, model=movie_model, product=dp)
58 |     razor(ds)
59 |     ds.df.show()
60 |     ds = schema_checker(ds)
61 | 


--------------------------------------------------------------------------------
/tests/test_model_compilation.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | import os
  5 | 
  6 | import pytest
  7 | from jsonschema import ValidationError
  8 | 
  9 | from driver import util
 10 | from driver.core import ArtefactType
 11 | from driver.util import compile_product, compile_models, filter_list_by_id
 12 | from tests.conftest import DEFAULT_BUCKET
 13 | 
 14 | 
 15 | # @pytest.fixture
 16 | # def metadata_path():
 17 | #     cwd_path = os.path.dirname(os.path.abspath(__file__))
 18 | #     return os.path.join(cwd_path, 'assets', 'model_defs')
 19 | 
 20 | 
 21 | def test_basic_model_compilation(fixture_asset_path, app_args):
 22 |     product = compile_product(fixture_asset_path, app_args)
 23 |     models = compile_models(fixture_asset_path, product)
 24 |     assert product.engine == 'glue'
 25 |     assert product.id == 'some_data_product'
 26 |     assert product.owner == 'jane@acme.com'
 27 |     assert product.version == '1.0.0'
 28 |     assert product.defaults.storage.location == DEFAULT_BUCKET
 29 |     assert getattr(product, 'pipeline')
 30 |     assert getattr(product.pipeline, 'tasks')
 31 |     assert len(product.pipeline.tasks) == 1
 32 |     assert product.pipeline.tasks[0].id == 'process_some_files'
 33 |     assert len(product.pipeline.tasks[0].inputs) == 1
 34 |     assert len(product.pipeline.tasks[0].outputs) == 1
 35 |     inp = product.pipeline.tasks[0].inputs[0]
 36 |     assert hasattr(inp, 'connection')
 37 |     assert hasattr(inp, 'model')
 38 |     assert hasattr(inp, 'table')
 39 |     assert len(models) == 4
 40 | 
 41 | 
 42 | # def test_connection_with_model(metadata_path):
 43 | #     args = ConfigContainer()
 44 | #     product = compile_product(metadata_path, args, prod_def_filename='product_correct_connection_w_model.yml')
 45 | #     models = compile_models(metadata_path, product, def_file_name='model_correct.yml')
 46 | #     assert len(models) == 2
 47 | 
 48 | 
 49 | # def test_minimal_model_compilation(product, models):
 50 | #     assert models
 51 | 
 52 | 
 53 | def test_advanced_compilation_features(fixture_asset_path, app_args):
 54 |     # abs_product_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'assets', 'advanced_compilation')
 55 |     product = compile_product(fixture_asset_path, app_args, prod_def_filename='product_compilation.yml')
 56 |     models = compile_models(fixture_asset_path, product, def_file_name='model_compilation.yml')
 57 |     assert len(models) == 2
 58 |     person_pii = filter_list_by_id(models, 'person_pii')
 59 |     assert person_pii.storage.location == DEFAULT_BUCKET, 'The default bucket should be set on models with no explicit location'
 60 |     assert person_pii.storage.type == 'lake'
 61 |     person_pub = filter_list_by_id(models, 'person_pub')
 62 |     assert person_pub.storage.type == 'lake'
 63 |     pub_full_name_col = filter_list_by_id(person_pub.columns, 'full_name')
 64 |     assert pub_full_name_col.type == 'string', 'The String type should have been inherited from the pii model'
 65 |     assert pub_full_name_col.transform[
 66 |                0].type == 'encrypt', 'The Transform should have been inherited from the pii model'
 67 |     pub_full_id_col = filter_list_by_id(person_pub.columns, 'id')
 68 |     assert pub_full_id_col, 'ID col should have been inherited from the pii model'
 69 |     assert pub_full_id_col.type == 'integer', 'ID col type should have been inherited from the pii model'
 70 |     gender = filter_list_by_id(person_pub.columns, 'gender')
 71 |     assert gender, 'The model should inherit the gender column from the person pii model'
 72 | 
 73 | 
 74 | # def test_mode_extend_compilation_non_specified_field():
 75 | #     abs_product_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'assets')
 76 | #     args = ConfigContainer()
 77 | #     product = compile_product(abs_product_path, args)
 78 | #     models = compile_models(abs_product_path, product)
 79 | 
 80 | 
 81 | def test_model_schema_correct(fixture_asset_path):
 82 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'model_correct.yml'))
 83 |     util.validate_schema(product_def, ArtefactType.model)
 84 | 
 85 | 
 86 | def test_product_schema_correct(fixture_asset_path):
 87 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_correct.yml'))
 88 |     util.validate_schema(product_def, ArtefactType.product)
 89 | 
 90 | 
 91 | def test_product_schema_correct_with_models(fixture_asset_path):
 92 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_correct_all_models.yml'))
 93 |     util.validate_schema(product_def, ArtefactType.product)
 94 | 
 95 | 
 96 | def test_product_schema_wrong_engine(fixture_asset_path):
 97 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_wrong_engine.yml'))
 98 |     with pytest.raises(ValidationError) as vex:
 99 |         util.validate_schema(product_def, ArtefactType.product)
100 | 
101 | 
102 | def test_product_schema_output_err(fixture_asset_path):
103 |     # missing module parameters
104 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_wrong_output.yml'))
105 |     with pytest.raises(ValidationError) as vex:
106 |         util.validate_schema(product_def, ArtefactType.product)
107 | 
108 | 
109 | def test_product_missing_logic(fixture_asset_path):
110 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_missing_logic.yml'))
111 |     util.validate_schema(product_def, ArtefactType.product)
112 | 
113 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_correct_missing_logic_params.yml'))
114 |     util.validate_schema(product_def, ArtefactType.product)
115 | 
116 | 
117 | def test_connection_input_configuration(fixture_asset_path):
118 |     pass
119 | 
120 | 
121 | def test_model_input_configuration(fixture_asset_path):
122 |     pass
123 | 
124 | 
125 | def test_file_input_configuration(fixture_asset_path):
126 |     # product_input_file.yml
127 |     pass
128 | 


--------------------------------------------------------------------------------
/tests/test_task_executor.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import driver
 5 | from driver import ConfigContainer
 6 | from pyspark.sql import DataFrame
 7 | from driver import DataSet
 8 | from driver.processors import schema_checker, constraint_processor, transformer_processor
 9 | 
10 | 
11 | def test_end_to_end(spark_session, transaction_df: DataFrame, fixture_asset_path, app_args):
12 |     dfs = {"some_schema.some_table": transaction_df}
13 | 
14 |     def mock_input_handler(props: ConfigContainer):
15 |         return dfs.get(props.table)
16 | 
17 |     def mock_output_handler(ds: DataSet):
18 |         assert ds.id == 'transaction'
19 |         assert ds.df.count() == transaction_df.count()
20 |         ds.df.show()
21 |         ds.df.describe()
22 | 
23 |     driver.init(spark_session)
24 |     driver.register_data_source_handler('connection', mock_input_handler)
25 |     driver.register_postprocessors(transformer_processor, schema_checker, constraint_processor)
26 |     driver.register_output_handler('default', mock_output_handler)
27 |     driver.register_output_handler('lake', mock_output_handler)
28 |     setattr(app_args, 'product_path', fixture_asset_path)
29 |     print('something')
30 |     driver.process_product(app_args, fixture_asset_path)
31 | 
32 | def test_resolve_io_type():
33 |     pass
34 | 


--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | def test_enrich_models(fixture_asset_path):
5 |     pass
6 | 


--------------------------------------------------------------------------------
/version.sh:
--------------------------------------------------------------------------------
1 | export VERSION=1.0.4


--------------------------------------------------------------------------------