├── .github └── workflows │ ├── bump-version.yml │ ├── ci.yml │ └── release.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── MANIFEST.in ├── NOTICES.txt ├── README.md ├── README_dev.md ├── access └── policy_template.json ├── builtin ├── __init__.py └── ingest.py ├── docs ├── access-management.md ├── architecture.md ├── data-product-processor-arch.png ├── data-product-specification.md └── how-to │ ├── custom-dependencies.md │ ├── local-development.md │ └── transformation-logic.md ├── driver ├── __init__.py ├── aws │ ├── __init__.py │ ├── datalake_api.py │ ├── glue_api.py │ ├── providers.py │ └── resolvers.py ├── common.py ├── core.py ├── driver.py ├── io_handlers.py ├── packager.py ├── processors.py ├── schema │ └── 1.rc-1 │ │ ├── model.json │ │ └── product.json ├── task_executor.py └── util.py ├── main.py ├── package.py ├── pytest.ini ├── requirements-test.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── assets │ ├── aws_api_rsps │ │ ├── aws_glue_dc_connection.json │ │ ├── glue_dc_get_db_rsp.json │ │ ├── glue_dc_get_db_rsps.json │ │ └── glue_gc_get_table_rsp.json │ ├── integration │ │ ├── model.yml │ │ ├── product.yml │ │ └── tasks │ │ │ ├── __init__.py │ │ │ └── custom_business_logic.py │ ├── integration_calendar │ │ ├── model.yml │ │ └── product.yml │ ├── integration_file │ │ ├── model.yml │ │ └── product.yml │ ├── integration_sport_events │ │ ├── __init__.py │ │ ├── model.yml │ │ ├── product.yml │ │ └── tasks │ │ │ ├── __init__.py │ │ │ └── custom_aggregate_events.py │ └── metafiles │ │ ├── model.yml │ │ ├── model_compilation.yml │ │ ├── model_correct.yml │ │ ├── model_remove_xtra_columns.yml │ │ ├── model_strict_validation.yml │ │ ├── product.yml │ │ ├── product_compilation.yml │ │ ├── product_correct.yml │ │ ├── product_correct_all_models.yml │ │ ├── product_correct_connection_w_model.yml │ │ ├── product_correct_missing_logic_params.yml │ │ ├── product_input_file.yml │ │ ├── product_missing_logic.yml │ │ ├── product_wrong_engine.yml │ │ └── product_wrong_output.yml ├── aws │ ├── __init__.py │ └── test_datalake.py ├── catalog │ ├── __init__.py │ └── test_catalog.py ├── conftest.py ├── test_constraint_checkers.py ├── test_core.py ├── test_df_schema_validator.py ├── test_model_compilation.py ├── test_task_executor.py └── test_util.py └── version.sh /.github/workflows/bump-version.yml: -------------------------------------------------------------------------------- 1 | name: bump-version 2 | 3 | on: 4 | push: 5 | tags: 6 | - major-* 7 | - minor-* 8 | - patch-* 9 | 10 | permissions: 11 | contents: write 12 | pull-requests: write 13 | 14 | jobs: 15 | bump-version: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | fetch-depth: 0 21 | ref: ${{ github.ref }} 22 | - name: set to major release 23 | if: github.ref_type == 'tag' && startsWith(github.ref_name, 'major-') 24 | run: | 25 | echo "RELEASE_TYPE=major" >> $GITHUB_ENV 26 | - name: set to minor release 27 | if: github.ref_type == 'tag' && startsWith(github.ref_name, 'minor-') 28 | run: | 29 | echo "RELEASE_TYPE=minor" >> $GITHUB_ENV 30 | - name: set to patch release 31 | if: github.ref_type == 'tag' && startsWith(github.ref_name, 'patch-') 32 | run: | 33 | echo "RELEASE_TYPE=patch" >> $GITHUB_ENV 34 | - id: bump2version 35 | name: bump data product processor version 36 | run: | 37 | git config --global user.email "CI" 38 | git config --global user.name "CI@users.noreply.github.com" 39 | 40 | pip install bump2version 41 | . 
./version.sh 42 | echo $VERSION 43 | bump2version --current-version $VERSION --tag-name '{new_version}' --tag ${RELEASE_TYPE} 44 | # load new version in environment 45 | . ./version.sh 46 | echo $VERSION 47 | echo "VERSION=$VERSION" >> $GITHUB_ENV 48 | # push tag 49 | git push origin refs/tags/${VERSION} 50 | - name: create pull request 51 | uses: peter-evans/create-pull-request@v4 52 | with: 53 | title: "release version: ${{ env.VERSION }}" 54 | branch: release-${{ env.VERSION }} 55 | delete-branch: true 56 | base: main 57 | labels: | 58 | release 59 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 0 16 | - name: unit tests 17 | run: | 18 | pip install --upgrade pip 19 | pip install -U -e . 20 | pip install -r requirements-test.txt 21 | pytest --cov=deprecated -s -m 'not integration' 22 | - id: bump2version 23 | name: bump data product processor version to pre-release 24 | run: | 25 | pip install bump2version 26 | . ./version.sh 27 | echo "latest release: ${VERSION}" 28 | head=$(git rev-parse --short HEAD) 29 | commits_since_tag=$(git rev-list ${VERSION}..HEAD --count) 30 | echo "new snapshot: ${VERSION}+${commits_since_tag}.${head}" 31 | bump2version --new-version ${VERSION}+${commits_since_tag}.${head} pre 32 | - name: build wheel 33 | run: | 34 | python setup.py build -vf && python setup.py bdist_wheel 35 | - name: archive dist 36 | uses: actions/upload-artifact@v3 37 | with: 38 | name: dpp-dist 39 | retention-days: 30 40 | path: | 41 | ./dist/* 42 | ./main.py 43 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | outputs: 15 | version: ${{ steps.getversion.outputs.VERSION }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | - id: getversion 19 | name: get version 20 | run: | 21 | . ./version.sh 22 | echo "VERSION=$VERSION" >> $GITHUB_ENV 23 | echo "VERSION=$VERSION" >> $GITHUB_OUTPUT 24 | - name: build wheel 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -U -e . 28 | pip install -r requirements-test.txt 29 | python setup.py build -vf && python setup.py bdist_wheel 30 | - name: archive artifacts 31 | uses: actions/upload-artifact@v3 32 | with: 33 | name: data-product-processor-${{ env.VERSION }} 34 | retention-days: 30 35 | path: | 36 | ./dist/* 37 | ./main.py 38 | publish-test: 39 | needs: build 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/download-artifact@v3 43 | with: 44 | name: data-product-processor-${{ needs.build.outputs.version }} 45 | - name: publish 46 | uses: pypa/gh-action-pypi-publish@release/v1 47 | with: 48 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 49 | repository_url: https://test.pypi.org/legacy/ 50 | print_hash: true 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | .DS_Store/ 7 | .DS_Store 8 | *.py[cod] 9 | *$py.class 10 | pyrightconfig.json 11 | # C extensions 12 | *.so 13 | .vscode 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | spark_deps/ 118 | spark_deps 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | #IDE 139 | .idea/ 140 | */glue-libs/ 141 | 142 | cdk/functions/config_validation/env 143 | cdk/functions/config_validation/package 144 | cdk/functions/config_validation/package.sh 145 | cdk/functions/config_validation/my-deployment-package.zip 146 | 147 | # Logs 148 | logs 149 | *.log 150 | npm-debug.log* 151 | yarn-debug.log* 152 | yarn-error.log* 153 | lerna-debug.log* 154 | .pnpm-debug.log* 155 | 156 | # Diagnostic reports (https://nodejs.org/api/report.html) 157 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 158 | 159 | # Runtime data 160 | pids 161 | *.pid 162 | *.seed 163 | *.pid.lock 164 | 165 | # Dependency directories 166 | node_modules/ 167 | jspm_packages/ 168 | 169 | # Optional npm cache directory 170 | .npm 171 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. 
You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include driver/schema/*/*.json -------------------------------------------------------------------------------- /NOTICES.txt: -------------------------------------------------------------------------------- 1 | dpac-data-product-processor 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | 4 | ********************** 5 | THIRD PARTY COMPONENTS 6 | ********************** 7 | 8 | Package: attrs 9 | License: MIT 10 | Requires: n/a 11 | Author: Hynek Schlawack 12 | Home page: https://www.attrs.org/ 13 | 14 | ---------------------------------------- 15 | 16 | Package: boto3 17 | License: Apache-2.0 18 | Requires: botocore, jmespath, s3transfer 19 | Author: Amazon Web Services 20 | Home page: https://github.com/boto/boto3 21 | 22 | ---------------------------------------- 23 | 24 | Package: botocore 25 | License: Apache-2.0 26 | Requires: jmespath, python-dateutil, urllib3 27 | Author: Amazon Web Services 28 | Home page: https://github.com/boto/botocore 29 | 30 | ---------------------------------------- 31 | 32 | Package: jmespath 33 | License: MIT 34 | Requires: n/a 35 | Author: James Saryerwinnie 36 | Home page: https://github.com/jmespath/jmespath.py 37 | 38 | ---------------------------------------- 39 | 40 | Package: jsonschema 41 | License: MIT 42 | Requires: attrs, pyrsistent, setuptools, six 43 | Author: Julian Berman 44 | Home page: https://github.com/Julian/jsonschema 45 | 46 | ---------------------------------------- 47 | 48 | Package: mypy-boto3-glue 49 | License: MIT 50 | Requires: n/a 51 | Author: Vlad Emelianov 52 | Home page: https://github.com/vemel/mypy_boto3_builder 53 | 54 | ---------------------------------------- 55 | 56 | Package: pydantic 57 | License: MIT 58 | Requires: typing-extensions 59 | Author: Samuel Colvin 60 | Home page: https://github.com/samuelcolvin/pydantic 61 | 62 | The MIT License (MIT) 63 | 64 | ---------------------------------------- 65 | 66 | Package: pyrsistent 67 | License: MIT 68 | Requires: n/a 69 | Author: Tobias Gustafsson 70 | Home page: http://github.com/tobgu/pyrsistent/ 71 | 72 | ---------------------------------------- 73 | 74 | Package: python-dateutil 75 | License: Apache-2.0 76 | Requires: six 77 | Author: Gustavo Niemeyer 78 | Home page: https://github.com/dateutil/dateutil 79 | 80 | -------------------------------------------------------------------------------- 81 | dateutil - Extensions to the standard Python datetime module. 82 | 83 | Copyright (c) 2003-2011 - Gustavo Niemeyer 84 | Copyright (c) 2012-2014 - Tomi Pieviläinen 85 | Copyright (c) 2014-2016 - Yaron de Leeuw 86 | Copyright (c) 2015- - Paul Ganssle 87 | Copyright (c) 2015- - dateutil contributors (see AUTHORS file) 88 | 89 | All rights reserved. 90 | 91 | Redistribution and use in source and binary forms, with or without 92 | modification, are permitted provided that the following conditions are met: 93 | 94 | * Redistributions of source code must retain the above copyright notice, 95 | this list of conditions and the following disclaimer. 96 | * Redistributions in binary form must reproduce the above copyright notice, 97 | this list of conditions and the following disclaimer in the documentation 98 | and/or other materials provided with the distribution. 99 | * Neither the name of the copyright holder nor the names of its 100 | contributors may be used to endorse or promote products derived from 101 | this software without specific prior written permission. 102 | 103 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 104 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 105 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 106 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 107 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 108 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 109 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 110 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 111 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 112 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 113 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 114 | 115 | The above BSD License Applies to all code, even that also covered by Apache 2.0. 116 | 117 | ---------------------------------------- 118 | 119 | Package: quinn 120 | License: Apache-2.0 121 | Requires: n/a 122 | Author: MrPowers 123 | Home page: https://github.com/MrPowers/quinn/ 124 | 125 | ---------------------------------------- 126 | 127 | Package: s3transfer 128 | License: Apache-2.0 129 | Requires: botocore 130 | Author: Amazon Web Services 131 | Home page: https://github.com/boto/s3transfer 132 | 133 | ---------------------------------------- 134 | 135 | Package: setuptools 136 | License: MIT 137 | Requires: n/a 138 | Author: Python Packaging Authority 139 | Home page: https://github.com/pypa/setuptools 140 | 141 | ---------------------------------------- 142 | 143 | Package: six 144 | License: MIT 145 | Requires: n/a 146 | Author: Benjamin Peterson 147 | Home page: https://github.com/benjaminp/six 148 | 149 | ---------------------------------------- 150 | 151 | Package: urllib3 152 | License: MIT 153 | Requires: n/a 154 | Author: Andrey Petrov 155 | Home page: https://urllib3.readthedocs.io/ 156 | 157 | MIT License 158 | 159 | Copyright (c) 2008-2020 Andrey Petrov and contributors (see CONTRIBUTORS.txt) 160 | 161 | Permission is hereby granted, free of charge, to any person obtaining a copy 162 | of this software and associated documentation files (the "Software"), to deal 163 | in the Software without restriction, including without limitation the rights 164 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 165 | copies of the Software, and to permit persons to whom the Software is 166 | furnished to do so, subject to the following conditions: 167 | 168 | The above copyright notice and this permission notice shall be included in all 169 | copies or substantial portions of the Software. 170 | 171 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 172 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 173 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 174 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 175 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 176 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 177 | SOFTWARE. 178 | 179 | ---------------------------------------- 180 | 181 | Package: wheel 182 | License: MIT 183 | Requires: n/a 184 | Author: Daniel Holth 185 | Home page: https://github.com/pypa/wheel 186 | 187 | ---------------------------------------- 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data product processor 2 | 3 | The data product processor is a library for dynamically creating and executing Apache Spark Jobs based on a declarative description of a data product. 
4 | 5 | The declaration is based on YAML and covers input and output data stores as well as data structures. It can be augmented with custom, PySpark-based transformation logic. 6 | 7 | ## Installation 8 | **Prerequisites** 9 | - Python 3.x 10 | - Apache Spark 3.x 11 | 12 | **Install with pip** 13 | ```commandline 14 | pip install data-product-processor 15 | ``` 16 | 17 | ## Getting started 18 | ### Declare a basic data product 19 | Please see [Data product specification](docs/data-product-specification.md) for an overview on the files required to declare a data product. 20 | 21 | ### Process the data product 22 | From folder in which the previously created file are stored, run the data-product-processor as follows: 23 | 24 | ```commandline 25 | data-product-processor \ 26 | --default_data_lake_bucket some-datalake-bucket \ 27 | --aws_profile some-profile \ 28 | --aws_region eu-central-1 \ 29 | --local 30 | ``` 31 | This command will run Apache Spark locally (due to the --local switch) and store the output on an S3 bucket (authenticated with the AWS profile used in the parameter). 32 | 33 | If you want to run the library from a different folder than the data product declaration, reference the latter through the additional argument `--product_path`. 34 | ```commandline 35 | data-product-processor \ 36 | --product_path ../path-to-some-data-product \ 37 | --default_data_lake_bucket some-datalake-bucket \ 38 | --aws_profile some-profile \ 39 | --aws_region eu-central-1 \ 40 | --local 41 | ``` 42 | 43 | ## CLI Arguments 44 | ```commandline 45 | data-product-processor --help 46 | 47 | --JOB_ID - the unique id of this Glue/EMR job 48 | --JOB_RUN_ID - the unique id of this Glue job run 49 | --JOB_NAME - the name of this Glue job 50 | --job-bookmark-option - job-bookmark-disable if you don't want bookmarking 51 | --TempDir - tempoarary results directory 52 | --product_path - the data product definition folder 53 | --aws_profile - the AWS profile to be used for connection 54 | --aws_region - the AWS region to be used 55 | --local - local development 56 | --jars - extra jars to be added to the Spark context 57 | --additional-python-modules - this parameter is injected by Glue, currently it is not in use 58 | --default_data_lake_bucket - a default bucket location (with s3a:// prefix) 59 | ``` 60 | ## References 61 | - [Data product specification](docs/data-product-specification.md) 62 | - [Access management](docs/access-management.md) 63 | 64 | ## Tutorials 65 | - [How to write and test custom transformation logic?](docs/how-to/transformation-logic.md) 66 | - [How to reference custom Spark dependencies?](docs/how-to/custom-dependencies.md) 67 | - [How to set up local development?](docs/how-to/local-development.md) 68 | -------------------------------------------------------------------------------- /README_dev.md: -------------------------------------------------------------------------------- 1 | [![pipeline status](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/badges/master/pipeline.svg)](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/-/commits/master) 2 | [![coverage report](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/badges/master/coverage.svg)](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-task-interpreter/-/commits/master) 3 | 4 | # Data Mesh Task Interpreter 5 | 6 | Interprets YAML based task definition of 7 | the [data mesh](https://gitlab.aws.dev/aws-sa-dach/teams/dnb/data-mesh-solution) as AWS Glue job. 
8 | 9 | ## Format 10 | 11 | See [model.yml](deprecated_ts/interpreters/model.yml) and [product.yml](deprecated_ts/interpreters/product.yml) 12 | test examples. 13 | 14 | # Setup real-local development environment 15 | 16 | ## Install development environment on OSX 17 | 18 | Everything will be installed in virtual environment in your local project folder. 19 | 20 | ```bash 21 | python3 -m venv .venv 22 | source .venv/bin/activate 23 | pip install -r requirements-test.txt 24 | ``` 25 | 26 | Don't forget to switch the new virtual environment in your IDE too. 27 | 28 | Building the wheel package: 29 | 30 | ```commandline 31 | pip install -U pip wheel setuptools 32 | python3 setup.py bdist_wheel 33 | ``` 34 | As a result you should see 35 | 36 | Also: make sure Java is installed. On OSX: 37 | 38 | ```bash 39 | brew tap homebrew/cask-versions 40 | brew update 41 | brew tap homebrew/cask 42 | brew tap adoptopenjdk/openjdk 43 | brew install --cask adoptopenjdk11 44 | brew install maven 45 | ``` 46 | 47 | Install spark dependencies: 48 | 49 | ```bash 50 | mkdir spark_deps 51 | cd spark_deps 52 | wget https://jdbc.postgresql.org/download/postgresql-42.2.23.jar 53 | ``` 54 | 55 | Install the AWS dependencies for hadoop: 56 | 57 | 1. check the current version of hadoop: ```ll -al .venv/lib/python3.9/site-packages/pyspark/jars |grep hadoop``` 58 | 2. create a POM file in the spark_deps folder (make sure the version field matches the current hadoop version): 59 | 60 | ```xml 61 | 62 | 4.0.0 63 | com.mycompany.app 64 | my-app 65 | 1 66 | 67 | 68 | org.apache.hadoop 69 | hadoop-aws 70 | 3.3.1 71 | 72 | 73 | 74 | ``` 75 | 76 | Download the dependencies: 77 | 78 | ```bash 79 | mvn --batch-mode -f ./pom.xml -DoutputDirectory=./jars dependency:copy-dependencies 80 | mv jars/* . 81 | ``` 82 | 83 | Set the following parameters onto the execution context in your IDE: 84 | 85 | ```commandline 86 | --product_path /tests/assets/integration --default_data_lake_bucket --aws_profile --aws_region --local 87 | ``` 88 | 89 | Alternatively you can run the whole solution from the command line: 90 | ```commandline 91 | data-product-processor --JOB_NAME "TEST" --product_path /tests/assets/integration --default_data_lake_bucket --aws_profile --aws_region 92 | ``` 93 | 94 | Optionally you might need to export Spark Home if the Spark environment is not found in your installation. 95 | 96 | ```commandline 97 | export SPARK_HOME="$(pwd)/.venv/lib/python3.9/site-packages/pyspark" 98 | ``` 99 | 100 | Run the tests from command line (while the virtual environment is activated): 101 | 102 | ```commandline 103 | pytest 104 | ``` 105 | 106 | ## Troubleshooting 107 | 108 | On error: 109 | ``` 110 | py4j.protocol.Py4JError: org.apache.spark.api.python.PythonUtils.getPythonAuthSocketTimeout does not exist in the JVM 111 | ``` 112 | 113 | Type this: 114 | ```commandline 115 | export PYTHONPATH="${SPARK_HOME}/python;${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip;${PYTHONPATH}" 116 | ``` 117 | 118 | # Issues 119 | 120 | ## Sfl4j not found 121 | 122 | ```commandline 123 | [NOT FOUND ] org.slf4j#slf4j-api;1.7.5!slf4j-api.jar 124 | ``` 125 | **Solution** 126 | Remove dir in .ivy2/cache, ivy2/jars and .m2/repository 127 | 128 | ## CI/CD 129 | 130 | The Gitlab based CI/CD pipeline can be dound at: [gitlab-ci.yml](.gitlab-ci.yml). 
131 | 132 | ## Setup local Spark playground 133 | 134 | This is a description of an optional and somewhat unrelated step, for setting up an interactive development environment that helps to experiment with Spark concepts in a local environment. 135 | 136 | Make sure that you execute these commands in a virtual environment (see the top of this document for instructions): 137 | 138 | ```commandline 139 | pip install ptpython 140 | ptpython 141 | ``` 142 | 143 | Type the following in the ptpython console:fs.s3a.aws.credentials.provider 144 | 145 | [optional] only if you encounter errors with the larger snippet bellow: 146 | ```python 147 | import findspark 148 | findspark.init() 149 | ``` 150 | Interactive development: 151 | ```python 152 | import sys 153 | import os 154 | from pyspark import SparkConf 155 | from pyspark.sql import SparkSession 156 | from pyspark.sql.types import ( 157 | StringType, 158 | StructField, 159 | StructType, 160 | IntegerType, 161 | LongType, 162 | DoubleType 163 | ) 164 | 165 | os.environ["AWS_PROFILE"] = '' 166 | conf = SparkConf() 167 | conf.set("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider") 168 | conf.set("spark.jars", './spark_deps/postgresql-42.2.23.jar') 169 | 170 | spark = SparkSession.builder.appName('repl') \ 171 | .config(conf=conf) \ 172 | .getOrCreate() 173 | 174 | movie_schema = StructType([ 175 | StructField('movieId', IntegerType(), True), 176 | StructField('title', StringType(), True), 177 | StructField('genres', StringType(), True) 178 | ]) 179 | 180 | df = spark.createDataFrame([(1, 'Jumanji(1995)', 'Adventure | Children | Fantasy'), 181 | (2, 'Heat (1995)', 'Action|Crime|Thriller')], 182 | movie_schema) 183 | ``` 184 | Get catalog information: 185 | ```python 186 | import boto3, json 187 | session = boto3.Session(profile_name='', region_name='eu-central-1') 188 | glue = session.client('glue') 189 | s = json.dumps(glue.get_table(DatabaseName='test_db', Name='person'), indent=4, default=str) 190 | print(s) 191 | ``` 192 | -------------------------------------------------------------------------------- /access/policy_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": "s3:GetObject", 7 | "Resource": "arn:aws:s3:::///*", 8 | "Condition": { 9 | "StringEquals": { 10 | "s3:ExistingObjectTag/tag_name": "tag_value" 11 | } 12 | } 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /builtin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /builtin/ingest.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import List 6 | import time 7 | import datetime 8 | from driver.task_executor import DataSet 9 | from pyspark.sql.functions import lit, unix_timestamp 10 | from pyspark.sql import SparkSession 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def execute(inp_datasets: List[DataSet], spark_session: SparkSession, create_timestamp=False): 16 | def resolve_data_set_id(ds: DataSet): 17 | model_id_raw = None 18 | if ds.model: 19 | model_id_raw = ds.model.id 20 | else: 21 | model_id_raw = ds.id 22 | 23 | id_tokens = model_id_raw.split('.') 24 | 25 | return id_tokens[len(id_tokens)-1] 26 | 27 | logger.info(f'create timestamp: {create_timestamp}') 28 | if create_timestamp: 29 | timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') 30 | for ds in inp_datasets: 31 | ds.df = ds.df.withColumn('ingest_date', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) 32 | return [DataSet(id=resolve_data_set_id(ds), df=ds.df) for ds in inp_datasets] 33 | -------------------------------------------------------------------------------- /docs/access-management.md: -------------------------------------------------------------------------------- 1 | # Access management 2 | 3 | The access management concept is based on two separate mechanisms: 4 | 5 | 1. Tagging all produced data to control which groups should have access to data 6 | - This is controlled by the data producers, via the model YAML files 7 | - The data producers know their data best and can control which groups should have access (does it contain PII? Is 8 | it intended to be public or private, etc.) 9 | - the platform takes over this process and tags all produced data files based on the configuration in the YAML files 10 | 2. Managing groups of people (or services) who are allows to join those groups to gain access to the data. 11 | - IAM policies, which provide access to S3 data files which have been tagged as mentioned before have to be created 12 | manually (as of now) 13 | - please see `access/policy_template.json` as an example for providing access to files which have specific tags 14 | defined. 15 | - those policies can be attached to IAM groups to provide access to one or multiple combinations of access control 16 | tags 17 | - IAM users then can join and leave groups to gain access to the data, matching the policies assigned to those 18 | groups 19 | 20 | ## Technical implementation 21 | 22 | The S3 writer automatically applies the following tags to all data files written out to S3: 23 | 24 | - tags defined in the `model.yml` under `models..tags` are added to all output data files in the dataset's S3 25 | folder as is, using the tag's name and value without modification. 26 | - tags defined in the `model.yml` under `models..access` are added to all output data files in the dataset's S3 27 | folder as well, but the tag names are prefixed with `access_`, to have a clear distinction between access control tags 28 | and custom tags, every data producer can define without limitation. 29 | - Example: the access tag `confidentiality` with value `private` will be assigned as S3 tag `access_confidentiality` 30 | with value `private`. 
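
To make the prefixing rule concrete, here is a minimal, self-contained sketch (not the actual S3 writer code); the tag names and values are only examples, and `tag_files(bucket, prefix, tags)` from `driver/aws/datalake_api.py` is the helper that applies such a dictionary to every object under an S3 prefix:

```python
# Illustrative sketch of the tagging rule described above -- not the actual S3 writer implementation.

def build_tag_set(custom_tags: dict, access_tags: dict) -> dict:
    """Custom tags are applied as-is; access control tags are prefixed with 'access_'."""
    tags = dict(custom_tags)
    tags.update({f'access_{name}': value for name, value in access_tags.items()})
    return tags


# Example values only: a model declaring the access tag confidentiality=private
# ends up with the S3 tag access_confidentiality=private.
tags = build_tag_set({'source': 'crm'}, {'confidentiality': 'private'})
assert tags == {'source': 'crm', 'access_confidentiality': 'private'}

# A dictionary like this is what driver.aws.datalake_api.tag_files(bucket, prefix, tags)
# applies to every data file under the dataset's S3 prefix.
```
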
31 | 32 | ## Limitations 33 | 34 | Based on the metadata defined in the model's YAML files, the processor will set S3 tags to all files written out to 35 | Amazon S3, found in the data dataset's "folder" (meaning all files, with the 36 | prefix `//`) 37 | 38 | Currently, only files written to S3 are supported to be tagged automatically. 39 | 40 | Access policies and group have to be created by the user manually and IAM users have to be assigned to IAM groups 41 | manually to actually manage access to the data. -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | ## Job processing chain 4 | 5 | ![architectural diagram](./data-product-processor-arch.png) -------------------------------------------------------------------------------- /docs/data-product-processor-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/docs/data-product-processor-arch.png -------------------------------------------------------------------------------- /docs/how-to/custom-dependencies.md: -------------------------------------------------------------------------------- 1 | # How to reference custom Spark dependencies? 2 | 3 | Sometimes you might need custom third party libraries for your aggregation logic. These can be added by creating a 4 | ```requirements.txt``` file in the root of your Data Product folder. In the following example we show, how to use 5 | Pydeequ (a third party analyzer and quality assurance library from Amazon): 6 | 7 | ```requirements.txt 8 | pydeequ 9 | ``` 10 | 11 | Pydeequ - in our example - is the python binding to the Deequ Scala implementation, that needs additional non-python ( 12 | Scala or Java) libraries to be added to the Spark cluster. 13 | This can be added via a ```config.ini``` file (also stored in the root of the data product). 14 | 15 | ```properties 16 | [spark jars] 17 | spark.jars.packages=com.amazon.deequ:deequ:1.2.2-spark-3.0 18 | spark.jars.excludes=net.sourceforge.f2j:arpack_combined_all 19 | ``` 20 | 21 | Once the pre-requisites are there, you can start using the new library in your custom logic: 22 | 23 | ```python 24 | from pyspark.sql.functions import concat, col, lit 25 | from driver.common import find_dataset_by_id 26 | from driver.task_executor import DataSet 27 | from typing import List 28 | from pyspark.sql import SparkSession, Row 29 | from pydeequ.analyzers import * 30 | 31 | 32 | def execute(inp_dfs: List[DataSet], spark_session: SparkSession): 33 | ds = find_dataset_by_id(inp_dfs, 'sample_product.sample_model') 34 | ds.df = ds.df.withColumn('full_name', concat(col('first_name'), lit(' '), col('last_name'))) 35 | 36 | analysis_result = AnalysisRunner(spark_session) 37 | .onData(ds.df) 38 | .addAnalyzer(Size()) 39 | .addAnalyzer(Completeness("b")) 40 | .run() 41 | 42 | 43 | analysis_result_df = AnalyzerContext.successMetricsAsDataFrame(spark_session, analysis_result) 44 | 45 | ds_model = DataSet(id='sample_model', df=ds.df) 46 | ds_analysis = DataSet(id='model_analysis', df=analysis_result_df) 47 | return [ds_model, ds_analysis] 48 | ``` 49 | 50 | Additionally you can create a custom initialisation file, called ```init_hook.py``` in the root folder of your data 51 | product. 
This file will give you control over the Spark environment and the data product processor environment as well. 52 | A feature that we can use to interact with the cluster configuration. 53 | 54 | ```python 55 | from typing import List, Dict 56 | from pyspark import SparkConf 57 | from driver.task_executor import DataSet 58 | 59 | 60 | def enrich_spark_conf(conf: SparkConf) -> SparkConf: 61 | conf.set("spark.sql.warehouse.dir", "some warehouse location") 62 | return conf 63 | 64 | 65 | def add_pre_processors() -> List[callable]: 66 | def my_custom_pre_processor(data_set: DataSet) -> DataSet: 67 | return data_set.df.filter(...) 68 | 69 | return [my_custom_pre_processor] 70 | 71 | 72 | def add_post_processors() -> List[callable]: 73 | def my_custom_post_processor(data_set: DataSet) -> DataSet: 74 | return data_set.df.filter(...) 75 | 76 | return [my_custom_post_processor] 77 | ``` 78 | 79 | **Please note:** all of the above methods are optional. The Spark configuration can also be influenced by the use of the 80 | ini file. 81 | 82 | #### Preparing your unit test to work with Pyspark custom configurations 83 | 84 | Create a file ```pytest.ini``` and add Spark options: 85 | 86 | ```properties 87 | [pytest] 88 | spark_options= 89 | spark.jars.packages=com.amazon.deequ:deequ:1.2.2-spark-3.0 90 | spark.jars.excludes=net.sourceforge.f2j:arpack_combined_all 91 | ``` 92 | -------------------------------------------------------------------------------- /docs/how-to/local-development.md: -------------------------------------------------------------------------------- 1 | # Setup of local development environment 2 | 3 | > **Note**: The subsequent steps assume an installation on ___MacOS/OSX___ 4 | 5 | ## 1) Installation of tools and dependencies 6 | 7 | ### Python 8 | Everything will be installed in virtual environment in your local project folder. 9 | 10 | ```bash 11 | python3 -m venv .venv 12 | source .venv/bin/activate 13 | pip install -r requirements-test.txt 14 | ``` 15 | 16 | ### Java 17 | 18 | Install openjdk and maven. 19 | 20 | ```bash 21 | brew tap homebrew/cask-versions 22 | brew update 23 | brew tap homebrew/cask 24 | brew tap adoptopenjdk/openjdk 25 | brew install --cask adoptopenjdk11 26 | brew install maven 27 | ``` 28 | 29 | ### Apache Spark 30 | 31 | Install spark dependencies: 32 | 33 | ```bash 34 | mkdir spark_deps 35 | cd spark_deps 36 | wget https://jdbc.postgresql.org/download/postgresql-42.2.23.jar 37 | ``` 38 | 39 | Install the AWS dependencies for Apache Hadoop: 40 | 41 | 1. check the current version of hadoop: ```ll -al .venv/lib/python3.9/site-packages/pyspark/jars |grep hadoop``` 42 | 2. create a POM file in the spark_deps folder (make sure the version field matches the current hadoop version): 43 | 44 | ```xml 45 | 46 | 4.0.0 47 | com.mycompany.app 48 | my-app 49 | 1 50 | 51 | 52 | org.apache.hadoop 53 | hadoop-aws 54 | 3.3.1 55 | 56 | 57 | 58 | ``` 59 | 60 | Then, run: 61 | 62 | ```bash 63 | mvn --batch-mode -f ./pom.xml -DoutputDirectory=./jars dependency:copy-dependencies 64 | mv jars/* . 65 | ``` 66 | 67 | ## 2) Test the installation 68 | 69 | > **Note:** Don't forget to switch the new virtual environment in your IDE too. 70 | 71 | Install in the local environment 72 | ```commandline 73 | pip install -e . 74 | ``` 75 | 76 | ### Local invocation of data-product-processor 77 | 78 | To test if the data-product-processor can be executed correctly, follow the subsequent steps. 
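
One option is to set the following arguments on the run/debug configuration (execution context) of your IDE; the bucket name, profile and region below are placeholders — replace them with your own values:

```commandline
--product_path tests/assets/integration --default_data_lake_bucket some-datalake-bucket --aws_profile some-profile --aws_region eu-central-1 --local
```
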
79 | 80 | Alternatively you can run the whole solution from the command line: 81 | 82 | ```commandline 83 | data-product-processor \ 84 | --JOB_NAME "TEST" \ 85 | --product_path /tests/assets/integration \ 86 | --default_data_lake_bucket \ 87 | --aws_profile \ 88 | --aws_region 89 | ``` 90 | 91 | Optionally you might need to export Spark Home if the Spark environment is not found in your installation. 92 | 93 | ```commandline 94 | export SPARK_HOME="$(pwd)/.venv/lib/python3.9/site-packages/pyspark" 95 | ``` 96 | 97 | Run the tests from command line (while the virtual environment is activated): 98 | 99 | ```commandline 100 | pytest 101 | ``` 102 | 103 | ### Package creation 104 | 105 | Test whether the python package (wheel) can be build through which the data-product-processor is distributed. 106 | 107 | ```commandline 108 | pip install -U pip wheel setuptools 109 | python3 setup.py bdist_wheel 110 | ``` 111 | 112 | 113 | # Troubleshooting / common errors 114 | 115 | ## py4j 116 | 117 | ``` 118 | py4j.protocol.Py4JError: org.apache.spark.api.python.PythonUtils.getPythonAuthSocketTimeout does not exist in the JVM 119 | ``` 120 | Resolve through: 121 | ```commandline 122 | export PYTHONPATH="${SPARK_HOME}/python;${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip;${PYTHONPATH}" 123 | ``` 124 | 125 | ## Sfl4j not found 126 | 127 | ```commandline 128 | [NOT FOUND ] org.slf4j#slf4j-api;1.7.5!slf4j-api.jar 129 | ``` 130 | **Solution** 131 | Remove dir in .ivy2/cache, ivy2/jars and .m2/repository 132 | -------------------------------------------------------------------------------- /docs/how-to/transformation-logic.md: -------------------------------------------------------------------------------- 1 | # How to write and test custom aggregation logic? 2 | 3 | Each custom aggregation logic has the same anatomy: it receives a list of input DataSets (that contains the Spark 4 | DataFrame) 5 | and must produce at least one output DataSet with a Spark DataFrame inside. Everything in between is standard Python and 6 | PySpark. 7 | 8 | The example below receives one DataSet with the ID ```person_raw```, adds a new timestamp column if 9 | the ```create_timestamp``` 10 | property was defined in the ```product.yml```'s pipeline > tasks > logic > parameters section and concatenates the 11 | first_name and last_names columns into a full_name column. The very same DataFrame is packaged into two different 12 | DataSets, with two different models referred to in the id property, so that the processor can do some post-processing on 13 | the dataframes, that are defined in those models. 14 | 15 | ```python 16 | def execute(inp_dfs: List[DataSet], spark_session: SparkSession, create_timestamp=False): 17 | ds = find_dataset_by_id(inp_dfs, 'person_raw') 18 | 19 | if create_timestamp: 20 | timestamp = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') 21 | ds.df = ds.df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) 22 | 23 | df = ds.df.withColumn('full_name', concat(col('first_name'), lit(' '), col('last_name'))) 24 | 25 | ds_pub = DataSet(id='person_pub', df=df) 26 | ds_pii = DataSet(id='person_pii', df=df) 27 | 28 | return [ds_pub, ds_pii] 29 | ``` 30 | 31 | In the example above, it is mandatory to provide the ```inp_dfs``` and the ```spark_session``` parameters, because these 32 | are injected by the task executor. 33 | 34 | The DataSet class provides access to the Spark Data Frame, as well to the model and the product metadata structure. 
35 | 36 | ```python 37 | @dataclass 38 | class DataSet: 39 | id: str 40 | df: DataFrame 41 | model: SimpleNamespace = None 42 | product: DataProduct = None 43 | ``` 44 | 45 | These can be referenced in each custom aggregation task code. 46 | 47 | Your custom aggregation logic is parametrised from the ```product.yml``` file's ```tasks``` section: 48 | 49 | ```yaml 50 | logic: 51 | module: tasks.custom_business_logic 52 | parameters: 53 | create_timestamp: false 54 | ``` 55 | 56 | ## Testing 57 | 58 | We recommend using the ```pytest``` framework for writing unit tests for your custom logic. 59 | 60 | ### 1) Create a virtual environment in root folder 61 | 62 | ```commandline 63 | python3 -m venv .venv 64 | source .venv/bin/activate 65 | ``` 66 | 67 | ### 2) Install data-product-processor 68 | ```commandline 69 | pip install data-product-processor 70 | ``` 71 | 72 | ### 3) Install python dependencies for test execution 73 | 74 | Create a ```requirements-test.txt``` file in the root folder of the data product with the following content: 75 | 76 | ```text 77 | pyspark 78 | pyspark-stubs 79 | pytest-spark 80 | pytest-mock 81 | pytest-helpers-namespace 82 | pytest-env 83 | pytest-cov 84 | pytest 85 | ``` 86 | 87 | Install them. 88 | ```commandline 89 | pip install -r requirements-test.txt 90 | ``` 91 | 92 | ### 4) Add tests 93 | 94 | Create a ```tests``` folder in your data product folder. 95 | 96 | ```commandline 97 | mkdir tests 98 | touch tests/__init__.py 99 | ``` 100 | Create a test configuration file called ```test_config.py``` 101 | with [fixtures](https://docs.pytest.org/en/6.2.x/fixture.html) (reusable, support functionality injected into your tests 102 | by the pytest framework). 103 | 104 | ```python 105 | 106 | from types import SimpleNamespace 107 | from pyspark.sql import DataFrame 108 | from pytest import fixture 109 | from pyspark.sql.types import ( 110 | StringType, 111 | StructField, 112 | StructType, 113 | IntegerType 114 | ) 115 | 116 | DEFAULT_BUCKET = 's3://test-bucket' 117 | 118 | 119 | @fixture 120 | def app_args() -> SimpleNamespace: 121 | args = SimpleNamespace() 122 | setattr(args, 'default_data_lake_bucket', DEFAULT_BUCKET) 123 | return args 124 | 125 | 126 | @fixture(scope='module') 127 | def person_schema() -> StructType: 128 | return StructType([ 129 | StructField('id', IntegerType(), False), 130 | StructField('first_name', StringType(), True), 131 | StructField('last_name', StringType(), True), 132 | StructField('age', IntegerType(), True), 133 | StructField('city', StringType(), True), 134 | StructField('gender', StringType(), True), 135 | ]) 136 | 137 | 138 | @fixture(scope='module') 139 | def person_df(spark_session, person_schema) -> DataFrame: 140 | return spark_session.createDataFrame([(1, "John", "Doe", 25, "Berlin", "male"), 141 | (2, "Jane", "Doe", 41, "Berlin", "female"), 142 | (3, "Maxx", "Mustermann", 30, "Berlin", "male") 143 | ], person_schema) 144 | ``` 145 | 146 | Next write your test function for your custom business logic in the ```test_custom_business_logic.py``` file: 147 | 148 | ```python 149 | from pyspark.sql import DataFrame 150 | 151 | 152 | def test_custom_logic(spark_session, person_df: DataFrame): 153 | data_source = DataSet(id='some_schema.some_table', df=person_df) 154 | results: List[DataSet] = tasks.custom_business_logic.execute([data_source], spark_session) 155 | for dataset in results: 156 | assert dataset.id == 'transformed_data_set' 157 | assert dataset.df.count() == person_df.count() 158 | dataset.df.show() 159 | 

You might want to run an end-to-end test by wiring together the minimal structure of the data product processor:

```python
import os
from types import SimpleNamespace

from pyspark.sql import DataFrame

import driver
from driver import DataSet
from driver.processors import schema_checker, constraint_processor, transformer_processor


def test_end_to_end(spark_session, spark_context, person_df: DataFrame, app_args):
    product_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')

    def mock_input_handler(input_definition: SimpleNamespace):
        dfs = {"source_id": person_df}
        return dfs.get(input_definition.table)

    def mock_output_handler(dataset: DataSet):
        assert dataset.id == 'transformed_data_set'
        assert dataset.df.count() == person_df.count()
        dataset.df.show()
        dataset.df.describe().show()

    driver.init(spark_session)
    driver.register_data_source_handler('connection', mock_input_handler)
    driver.register_postprocessors(transformer_processor, schema_checker, constraint_processor)
    driver.register_output_handler('default', mock_output_handler)
    driver.register_output_handler('lake', mock_output_handler)
    driver.process_product(app_args, product_folder)
```

You can run your tests from your favourite editor (e.g. PyCharm) or using the ```pytest``` command line.
-------------------------------------------------------------------------------- /driver/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .driver import process_product, init, install_dependencies
5 | from .task_executor import (
6 |     register_data_source_handler,
7 |     register_preprocessors,
8 |     register_postprocessors,
9 |     register_output_handler,
10 |     register_transformer,
11 |     add_transformers
12 | )
13 | from .core import (
14 |     DataSet,
15 |     ConfigContainer
16 | )
17 | from .common import read_csv, write_csv
18 |
-------------------------------------------------------------------------------- /driver/aws/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
-------------------------------------------------------------------------------- /driver/aws/datalake_api.py: --------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
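# Helper functions for discovering the S3 partition folders of a data set and for tagging its stored files.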
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import os 6 | from typing import List, Dict 7 | 8 | from driver.aws import providers 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Partition: 14 | def __init__(self, path_key): 15 | path_iterator = iter(os.path.split(path_key)) 16 | segments = next(pe for pe in path_iterator if pe).split('=') 17 | self.name = segments[0] 18 | self.value = segments[1] 19 | sub_partition = next(path_iterator, None) 20 | self.subpartitions = list() 21 | if sub_partition: 22 | o = Partition(sub_partition) 23 | self.subpartitions.append(o) 24 | 25 | def get_partition_chain(self, prefix: str, parent_key: str = None, parent_value: str = None) -> List[Dict[str, str]]: 26 | pchain = list() 27 | prepped_prefix = os.path.join(prefix, f'{self.name}={self.value}') 28 | pkeys = list() 29 | pkey_values = list() 30 | if parent_key and parent_value: 31 | pkeys.append(parent_key) 32 | pkey_values.append(parent_value) 33 | if len(self.subpartitions) > 0: 34 | for sp in self.subpartitions: 35 | pchain.extend(sp.get_partition_chain(prepped_prefix, parent_key=self.name, parent_value=self.value)) 36 | else: 37 | pkeys.append(self.name) 38 | pkey_values.append(self.value) 39 | pchain.append({'keys': pkeys, 'values': pkey_values, 'location': prepped_prefix}) 40 | return pchain 41 | 42 | 43 | def read_partitions(bucket: str, container_folder: str = None): 44 | s3 = providers.get_s3() 45 | rsp = s3.list_objects_v2(Bucket=bucket, Prefix=os.path.join(container_folder, '')) 46 | keys = set(os.path.dirname(k.get('Key')) for k in rsp.get('Contents')) 47 | prefix = rsp.get('Prefix') 48 | partition_keys = [p[len(prefix):] for p in keys if p != prefix.rstrip('/')] 49 | partitions = list() 50 | for p in partition_keys: 51 | partitions.append(Partition(p)) 52 | return partitions 53 | 54 | 55 | def tag_files(bucket: str, prefix: str, tags: dict): 56 | s3 = providers.get_s3() 57 | 58 | tags_s3 = [] 59 | for tag_name in tags.keys(): 60 | tags_s3.append({'Key': tag_name, 'Value': str(tags[tag_name])}) 61 | 62 | for key in find_files(bucket, prefix): 63 | s3.put_object_tagging(Bucket=bucket, Key=key, Tagging={'TagSet': tags_s3}) 64 | 65 | 66 | def find_files(bucket: str, prefix: str) -> List[str]: 67 | s3 = providers.get_s3() 68 | files = s3.list_objects_v2(Bucket=bucket, Prefix=prefix) 69 | return [f['Key'] for f in files['Contents']] 70 | -------------------------------------------------------------------------------- /driver/aws/glue_api.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
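# Glue Data Catalog maintenance: creates or updates the database, table and partitions that describe a produced data set.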
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | import botocore 7 | from mypy_boto3_glue.type_defs import GetDatabasesResponseTypeDef, DatabaseTypeDef, GetTablesResponseTypeDef, \ 8 | TableTypeDef, TableInputTypeDef, StorageDescriptorTypeDef, ColumnTypeDef, DatabaseInputTypeDef 9 | from mypy_boto3_glue.client import Exceptions 10 | from driver.aws import providers 11 | from driver.aws.resolvers import resolve_table_input, resolve_partition_inputs, resolve_database 12 | from driver.task_executor import DataSet 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def drain_data_catalog(data_catalog_id: str): 18 | glue = providers.get_glue() 19 | try: 20 | get_tables_response: GetTablesResponseTypeDef = glue.get_tables(DatabaseName=data_catalog_id) 21 | for table in get_tables_response.get('TableList'): 22 | glue.delete_table(DatabaseName=data_catalog_id, Name=table.get('Name')) 23 | except Exception as enf: 24 | if enf.__class__.__name__ == 'EntityNotFoundException': 25 | logger.warning( 26 | f'Database {data_catalog_id} does not exists in the data catalog. No tables will be deleted.') 27 | 28 | 29 | def update_data_catalog(ds: DataSet): 30 | glue = providers.get_glue() 31 | logger.info(f'--> Updating the data catalog for data product [{ds.product_id}] and model [{ds.model.id}].') 32 | 33 | def upsert_database(): 34 | try: 35 | rsp: GetDatabasesResponseTypeDef = glue.get_database(Name=ds.product_id) 36 | # todo: update database with changes 37 | except Exception as enf: 38 | if enf.__class__.__name__ == 'EntityNotFoundException': 39 | # database does not exists yet 40 | logger.warning( 41 | f'Database {ds.product_id} does not exists in the data catalog ({str(enf)}). It is going to be created.') 42 | # todo: add permissions 43 | glue.create_database( 44 | DatabaseInput=resolve_database(ds)) 45 | else: 46 | raise enf 47 | 48 | def upsert_table(): 49 | try: 50 | rsp: GetTablesResponseTypeDef = glue.get_table(DatabaseName=ds.product_id, Name=ds.id) 51 | # todo: update table 52 | glue.delete_table(DatabaseName=ds.product_id, Name=ds.id) 53 | glue.create_table(DatabaseName=ds.product_id, TableInput=resolve_table_input(ds)) 54 | # glue.update_table(DatabaseName=ds.product_id, TableInput=resolve_table_input(ds)) 55 | except Exception as enf: # EntityNotFoundException 56 | # table not found 57 | if enf.__class__.__name__ == 'EntityNotFoundException': 58 | logger.warning( 59 | f'Table [{ds.id}] cannot be found in the catalog schmea [{ds.product_id}]. 
Table is going to be created.') 60 | glue.create_table(DatabaseName=ds.product_id, TableInput=resolve_table_input(ds)) 61 | else: 62 | raise enf 63 | # rsp: GetTablesResponseTypeDef = glue.get_table(DatabaseName=ds.product_id, Name=ds.id) 64 | # todo: update partitions 65 | # todo: register with lakeformation 66 | 67 | def upsert_partitions(): 68 | # entries = resolve_partition_entries(ds) 69 | # rsp = glue.batch_update_partition(DatabaseName=ds.product_id, TableName=ds.model_id, Entries=entries) 70 | partition_inputs = resolve_partition_inputs(ds) 71 | if not partition_inputs: 72 | return 73 | rsp = glue.batch_create_partition(DatabaseName=ds.product_id, TableName=ds.id, 74 | PartitionInputList=partition_inputs) 75 | # rsp = glue.batch_update_partition(DatabaseName=ds.product_id, TableName=ds.id, 76 | # Entries=partition_inputs) 77 | if rsp.get('Errors'): 78 | raise Exception(f"Couldn't update the table [{ds.id}] with the partitions.") 79 | status_code = rsp.get('ResponseMetadata').get('HTTPStatusCode') 80 | logger.info(f'Partition upsert response with HTTP Status Code: {str(status_code)}') 81 | # todo: write a proper error handling here 82 | 83 | upsert_database() 84 | upsert_table() 85 | upsert_partitions() # todo: this is not yet an upsert (just in name but not in implementation) 86 | -------------------------------------------------------------------------------- /driver/aws/providers.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import traceback 6 | import boto3 7 | import mypy_boto3_glue 8 | from driver.core import ( 9 | Connection, 10 | ConnectionNotFoundException, 11 | DataProductTable, 12 | TableNotFoundException, 13 | ) 14 | 15 | __SESSION__ = None 16 | logger = logging.getLogger(__name__) 17 | 18 | def init( 19 | key_id: str = None, 20 | key_material: str = None, 21 | profile: str = None, 22 | region: str = None, 23 | ): 24 | global __SESSION__ 25 | if key_id and key_material and region: 26 | __SESSION__ = boto3.Session( 27 | aws_access_key_id=key_id, 28 | aws_secret_access_key=key_material, 29 | region_name=region, 30 | ) 31 | elif key_id and key_material and not region: 32 | __SESSION__ = boto3.Session( 33 | aws_access_key_id=key_id, aws_secret_access_key=key_material 34 | ) 35 | elif profile and region: 36 | __SESSION__ = boto3.Session(profile_name=profile, region_name=region) 37 | elif profile and not region: 38 | __SESSION__ = boto3.Session(profile_name=profile) 39 | elif region: 40 | __SESSION__ = boto3.Session(region_name=region) 41 | else: 42 | __SESSION__ = boto3.Session() 43 | logger.debug(f'boto session region: {__SESSION__.region_name}') 44 | # amongst others used to verify bucket ownership in interaction with s3 45 | global __AWS_ACCOUNT_ID__ 46 | sts = __SESSION__.client("sts") 47 | __AWS_ACCOUNT_ID__ = sts.get_caller_identity()["Account"] 48 | 49 | def get_session() -> boto3.Session: 50 | return __SESSION__ 51 | 52 | 53 | def get_aws_account_id() -> str: 54 | if not __AWS_ACCOUNT_ID__: 55 | raise Exception("Boto session is not initialized. Please call init first.") 56 | return __AWS_ACCOUNT_ID__ 57 | 58 | 59 | def get_glue() -> mypy_boto3_glue.GlueClient: 60 | if not get_session(): 61 | raise Exception("Boto session is not initialized. 
Please call init first.") 62 | return get_session().client("glue") 63 | 64 | 65 | def get_s3(): 66 | if not get_session(): 67 | raise Exception("Boto session is not initialized. Please call init first.") 68 | 69 | return get_session().client("s3") 70 | 71 | def describe_session(): 72 | boto_session = get_session() 73 | return f'| Profile: {boto_session.profile_name} | Region: {boto_session.region_name} | Access Key: {boto_session.get_credentials().access_key}' 74 | 75 | def connection_provider(connection_id: str) -> Connection: 76 | """ 77 | Returns a data connection object, that can be used to connect to databases. 78 | :param connection_id: 79 | :return: 80 | """ 81 | try: 82 | if not get_session(): 83 | raise Exception("Boto session is not initialized. Please call init first.") 84 | glue = get_session().client("glue") 85 | response = glue.get_connection(Name=connection_id, HidePassword=False) 86 | if "Connection" not in response: 87 | logger.error(f'Connection {connection_id} not found. Boto session: {describe_session()}. Connection request response: {response}') 88 | raise ConnectionNotFoundException( 89 | f"Connection [{connection_id}] could not be found." 90 | ) 91 | cprops = response.get("Connection").get("ConnectionProperties") 92 | logger.debug(f'Connection details: {response.get("Connection")}') 93 | native_host = cprops.get("JDBC_CONNECTION_URL")[len("jdbc:") :] 94 | logger.debug(f'native host definition: {native_host}') 95 | connection = Connection.parse_obj( 96 | { 97 | "name": connection_id, 98 | "host": native_host, 99 | "principal": cprops.get("USERNAME"), 100 | "credential": cprops.get("PASSWORD"), 101 | "type": native_host.split(":")[0], 102 | "ssl": cprops.get("JDBC_ENFORCE_SSL"), 103 | } 104 | ) 105 | return connection 106 | except Exception as e: 107 | logger.error(f'{type(e).__name__} exception received while retrieving the connection to the data source: {str(e)}). Boto session {describe_session()}.') 108 | logger.debug(f'Exception log: {traceback.format_exc()}') 109 | raise ConnectionNotFoundException( 110 | f"Connection [{connection_id}] could not be found. {str(e)}. Make sure you have the right region defined." 111 | ) 112 | 113 | 114 | def datalake_provider(product_id, table_id) -> DataProductTable: 115 | if not get_session(): 116 | raise Exception("Boto session is not initialized. Please call init first.") 117 | glue = get_session().client("glue") 118 | response = glue.get_table(DatabaseName=product_id, Name=table_id) 119 | if "Table" not in response: 120 | raise TableNotFoundException( 121 | f"Data Product Table [{product_id}.{table_id}] could not be found." 122 | ) 123 | table = DataProductTable.parse_obj( 124 | { 125 | "product_id": product_id, 126 | "table_id": table_id, 127 | "storage_location": response.get("Table") 128 | .get("StorageDescriptor") 129 | .get("Location"), 130 | } 131 | ) 132 | return table 133 | -------------------------------------------------------------------------------- /driver/aws/resolvers.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
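# Mapping helpers that translate DataSet metadata into Glue API input structures (tables, columns, partitions, storage descriptors).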
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os 5 | from typing import List, Dict 6 | from mypy_boto3_glue.type_defs import TableTypeDef, StorageDescriptorTypeDef, ColumnTypeDef, SerDeInfoTypeDef, \ 7 | BatchUpdatePartitionRequestEntryTypeDef, PartitionInputTypeDef, TableInputTypeDef, DatabaseInputTypeDef 8 | from pyspark.sql import DataFrame 9 | 10 | from driver.aws.datalake_api import Partition 11 | from driver.aws import datalake_api 12 | from driver.task_executor import DataSet 13 | from driver.util import filter_list_by_id, safe_get_property 14 | 15 | 16 | def resolve_partitions(ds: DataSet) -> List[ColumnTypeDef]: 17 | return [ColumnTypeDef(Name=p, Type=dict(ds.df.dtypes)[p]) for p in ds.partitions] 18 | 19 | 20 | def resolve_table_type(ds: DataSet) -> str: 21 | return 'EXTERNAL_TABLE' 22 | 23 | 24 | def resolve_table_parameters(ds: DataSet) -> Dict[str, str]: 25 | return { 26 | "classification": "parquet", 27 | "compressionType": "none", 28 | "objectCount": "1", 29 | "recordCount": str(ds.df.count()), 30 | "typeOfData": "file" 31 | } 32 | 33 | 34 | def resolve_input_format(ds: DataSet) -> str: 35 | formats = { 36 | 'parquet': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 37 | } 38 | return formats.get(ds.storage_format) 39 | 40 | 41 | def resolve_output_format(ds: DataSet) -> str: 42 | formats = { 43 | 'parquet': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 44 | } 45 | return formats.get(ds.storage_format) 46 | 47 | 48 | def resolve_compressed(ds: DataSet) -> bool: 49 | # return str(False).lower() 50 | return False 51 | 52 | 53 | def resolve_serde_info(ds: DataSet) -> SerDeInfoTypeDef: 54 | parquet = SerDeInfoTypeDef(SerializationLibrary='org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe', 55 | Parameters={'serialization.format': '1'}) 56 | serdes = { 57 | 'parquet': parquet 58 | } 59 | return serdes.get(ds.storage_format) 60 | 61 | 62 | def resolve_storage_descriptor(ds: DataSet, override_location: str = None) -> StorageDescriptorTypeDef: 63 | if override_location: 64 | path = f's3://{os.path.join(override_location, "")}' 65 | else: 66 | path = f"s3://{ds.dataset_storage_path.lstrip('/')}" 67 | return StorageDescriptorTypeDef( 68 | Location=path, 69 | InputFormat=resolve_input_format(ds), 70 | OutputFormat=resolve_output_format(ds), 71 | Compressed=resolve_compressed(ds), 72 | NumberOfBuckets=-1, # todo: check how to calculate this. 
73 | SerdeInfo=resolve_serde_info(ds), 74 | Parameters=resolve_table_parameters(ds), # todo: partition size 75 | Columns=resolve_columns(ds) 76 | ) 77 | 78 | 79 | def resolve_columns(ds: DataSet) -> List[ColumnTypeDef]: 80 | def lookup(column_name): 81 | if not hasattr(ds.model, 'columns'): 82 | return str() 83 | model_column = filter_list_by_id(ds.model.columns, column_name) 84 | if hasattr(model_column, 'name'): 85 | return f"{safe_get_property(model_column, 'name')}: {safe_get_property(model_column, 'description')}" 86 | else: 87 | return str() 88 | 89 | return [ColumnTypeDef(Name=cn, Type=ct, Comment=lookup(cn)) for cn, ct in ds.df.dtypes if cn not in ds.partitions] 90 | 91 | 92 | def resolve_table(ds: DataSet) -> TableTypeDef: 93 | return TableTypeDef( 94 | Name=ds.model_name, 95 | DatabaseName=ds.product_id, 96 | Description=ds.model_description, 97 | Owner=ds.product_owner, 98 | PartitionKeys=resolve_partitions(ds), 99 | TableType=resolve_table_type(ds), 100 | Parameters=resolve_table_parameters(ds), 101 | StorageDescriptor=resolve_storage_descriptor(ds) 102 | ) 103 | 104 | 105 | def resolve_table_input(ds: DataSet) -> TableInputTypeDef: 106 | return TableInputTypeDef( 107 | Name=ds.id, 108 | Description=f'{ds.model_name}: {ds.model_description}', 109 | Owner=ds.product_owner or str(), 110 | PartitionKeys=resolve_partitions(ds), 111 | TableType='EXTERNAL_TABLE', 112 | Parameters=resolve_table_parameters(ds), 113 | StorageDescriptor=resolve_storage_descriptor(ds) 114 | ) 115 | 116 | 117 | def resolve_partition_input(partition_location: str, partition_values: list, ds: DataSet) -> PartitionInputTypeDef: 118 | return PartitionInputTypeDef( 119 | Values=partition_values, 120 | StorageDescriptor=resolve_storage_descriptor(ds, override_location=partition_location), 121 | Parameters=resolve_table_parameters(ds), 122 | ) 123 | 124 | 125 | def reshuffle_partitions(prefix: str, partitions: List[Partition]) -> dict: 126 | partition_list = list() 127 | partition_dict = dict() 128 | for po in partitions: 129 | partition_list.extend(po.get_partition_chain(prefix=prefix)) 130 | for pdict in partition_list: 131 | # if pdict.get('location') not in ['glue-job-test-destination-bucket/person/gender=Female', 132 | # 'glue-job-test-destination-bucket/person/gender=Male']: 133 | # #todo: remove this ugly hack 134 | partition_dict[pdict.get('location')] = { 135 | 'keys': pdict.get('keys'), 136 | 'values': pdict.get('values') 137 | } 138 | return partition_dict 139 | 140 | 141 | def resolve_partition_inputs(ds: DataSet, format_for_update: bool = False) -> List[PartitionInputTypeDef]: 142 | bucket = ds.storage_location.lstrip('/').split('/')[0] 143 | folder = '/'.join(ds.dataset_storage_path.lstrip('/').split('/')[1:]) 144 | ps: List[Partition] = datalake_api.read_partitions(bucket=bucket, container_folder=folder) 145 | pdict = reshuffle_partitions(os.path.join(bucket, folder), ps) 146 | partition_defs = list() 147 | for k, v in pdict.items(): 148 | partition_values = v.get('values') 149 | if format_for_update: 150 | entry = {'PartitionValueList': v.get('values'), 151 | 'PartitionInput': resolve_partition_input(partition_location=k, partition_values=partition_values, 152 | ds=ds)} 153 | partition_defs.append(entry) 154 | else: 155 | partition_defs.append( 156 | resolve_partition_input(partition_location=k, partition_values=partition_values, ds=ds)) 157 | return partition_defs 158 | 159 | 160 | def resolve_partition_entries(ds: DataSet) -> List[BatchUpdatePartitionRequestEntryTypeDef]: 161 | partition_defs = 
list() 162 | bucket = ds.storage_location.lstrip('/').split('/')[0] 163 | folder = '/'.join(ds.dataset_storage_path.lstrip('/').split('/')[1:]) 164 | ps: List[Partition] = datalake_api.read_partitions(bucket=bucket, container_folder=folder) 165 | pdict = reshuffle_partitions(bucket, ps) 166 | for k, v in pdict.items(): 167 | partition_defs.append(BatchUpdatePartitionRequestEntryTypeDef( 168 | PartitionValueList=v.get('values'), 169 | PartitionInput=resolve_partition_input(partition_location=k, partition_values=v.get('values'), ds=ds) 170 | )) 171 | return partition_defs 172 | 173 | 174 | def resolve_database(ds: DataSet) -> DatabaseInputTypeDef: 175 | return DatabaseInputTypeDef(Name=ds.product_id, Description=ds.product.description or str()) 176 | -------------------------------------------------------------------------------- /driver/common.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List 5 | 6 | from pyspark.sql import DataFrame 7 | from pyspark.sql.types import StructType 8 | from driver import driver 9 | from driver.task_executor import DataSet 10 | 11 | 12 | def find_dataset_by_id(dss: List[DataSet], dataset_id): 13 | return next(iter([ds for ds in dss if ds.id == dataset_id]), None) 14 | 15 | 16 | def remap_schema(ds: DataFrame) -> List[StructType]: 17 | schema_fields = list() 18 | for col in ds.model.columns: 19 | if hasattr(col, 'transform') and 'skip' in [t.type for t in col.transform]: 20 | continue 21 | nullable = True 22 | if hasattr(col, 'constraints'): 23 | nullable = 'not_null' not in [c.type for c in col.constraints] 24 | schema_fields.append({'metadata': {}, 'name': col.id, 'type': col.type, 'nullable': nullable}) 25 | return StructType.fromJson({'fields': schema_fields, 'type': 'struct'}) 26 | 27 | 28 | def read_csv(path: str) -> DataFrame: 29 | return ( 30 | driver.get_spark().read 31 | .format("csv") 32 | .option("mode", "DROPMALFORMED") 33 | .option("header", "true") 34 | .load(path)) 35 | 36 | 37 | def write_csv(df: DataFrame, output_path: str, buckets=3) -> None: 38 | df.coalesce(buckets).write.format("csv").mode("overwrite").options(header="true").save( 39 | path=output_path) 40 | -------------------------------------------------------------------------------- /driver/core.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
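# Core domain objects shared across the processor: ConfigContainer, DataProduct, DataSet, the Connection model and the common exception types.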
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from urllib.parse import urlparse 5 | from botocore.client import logger 6 | from jsonschema import validate, ValidationError 7 | import os 8 | from types import SimpleNamespace 9 | from dataclasses import dataclass 10 | from pyspark.sql import DataFrame 11 | from enum import Enum 12 | from pydantic import ( 13 | BaseModel, 14 | AnyUrl, 15 | SecretStr, 16 | conint, 17 | validator, root_validator, parse_obj_as, ValidationError, error_wrappers, Field) 18 | from typing import Dict, List, Tuple, Any, TypeVar, Union 19 | from pydantic import AnyUrl 20 | from driver import util 21 | 22 | Scalar = TypeVar('Scalar', int, float, bool, str) 23 | 24 | class ConfigContainer(SimpleNamespace): 25 | def __init__(self, **kwargs): 26 | super().__init__(**kwargs) 27 | # for key, value in dictionary.items(): 28 | # if isinstance(value, dict): 29 | # self.__setattr__(key, ConfigContainer(value)) 30 | # else: 31 | # self.__setattr__(key, value) 32 | 33 | def __getattribute__(self, value): 34 | try: 35 | return super().__getattribute__(value) 36 | except AttributeError: 37 | # super().__setattr__(value, SimpleNamespace()) 38 | return super().__getattribute__(value) 39 | 40 | @dataclass 41 | class DataProduct: 42 | id: str 43 | description: str = None 44 | owner: str = None 45 | 46 | 47 | @dataclass 48 | class DataSet: 49 | id: str 50 | df: DataFrame 51 | model: ConfigContainer = None 52 | product: DataProduct = None 53 | 54 | @classmethod 55 | def find_by_id(cls, dataset_list, ds_id): 56 | return next(iter([m for m in dataset_list if m.id == ds_id]), None) 57 | 58 | @property 59 | def partitions(self) -> List[str]: 60 | if self.storage_options and hasattr(self.storage_options, 'partition_by'): 61 | if isinstance(self.storage_options.partition_by, str): 62 | return [self.storage_options.partition_by] 63 | else: 64 | return [p for p in self.storage_options.partition_by] 65 | else: 66 | return list() 67 | 68 | @property 69 | def storage_location(self) -> (str | None): 70 | if util.check_property(self, 'model.storage.location'): 71 | return self.model.storage.location 72 | else: 73 | return None 74 | 75 | @storage_location.setter 76 | def storage_location(self, path: str): 77 | if not self.model: 78 | raise Exception("There's no model on the dataset, so location cannot be set yet.") 79 | elif not hasattr(self.model, 'storage'): 80 | storage = ConfigContainer() 81 | setattr(storage, 'location', path) 82 | setattr(self.model, 'storage', storage) 83 | elif not hasattr(self.model.storage, 'location'): 84 | setattr(self.model.storage, 'location', path) 85 | else: 86 | self.model.storage.location = path 87 | 88 | @property 89 | def path(self) -> str: 90 | if self.id is None: 91 | raise Exception(f'Can not construct data set path because product id is not defined.') 92 | if not self.storage_location: 93 | raise Exception(f'The data set storage location is not set for dataset id: {self.id}.') 94 | return f"{self.product.id}/{self.id}" 95 | 96 | @property 97 | def dataset_storage_path(self) -> str: 98 | return f'{self.storage_location}/{self.path}' 99 | 100 | @property 101 | def storage_type(self) -> str: 102 | if self.model and hasattr(self.model, 'storage'): 103 | return self.model.storage.type 104 | else: 105 | return 'default' 106 | 107 | @property 108 | def storage_format(self) -> (str | None): 109 | if self.model and hasattr(self.model, 'storage'): 110 | return self.model.storage.format if hasattr(self.model.storage, 'format') else None 111 | else: 112 | return None 
113 | 114 | @property 115 | def storage_options(self) -> (ConfigContainer | None): 116 | if self.model and hasattr(self.model, 'storage') and hasattr(self.model.storage, 'options'): 117 | return self.model.storage.options 118 | else: 119 | return None 120 | 121 | @property 122 | def product_id(self) -> (str | None): 123 | return self.product.id if self.product else None 124 | 125 | @product_id.setter 126 | def product_id(self, p_id: str) -> None: 127 | if self.product: 128 | self.product.id = p_id 129 | else: 130 | self.product = DataProduct(id=p_id) 131 | 132 | @property 133 | def product_description(self) -> str: 134 | return self.product.description if self.product else None 135 | 136 | @property 137 | def product_owner(self) -> str: 138 | return self.product.owner if self.product else None 139 | 140 | @property 141 | def tags(self) -> dict: 142 | if not hasattr(self, 'model') or not hasattr(self.model, 'tags'): 143 | return dict() 144 | if self.id is None: 145 | raise Exception(f'Can not construct tags, id is not defined.') 146 | return self.model.tags.__dict__ 147 | 148 | @property 149 | def access_tags(self) -> dict: 150 | if not hasattr(self, 'model') or not hasattr(self.model, 'access'): 151 | return dict() 152 | if self.id is None: 153 | raise Exception(f'Can not construct tags, id is not defined.') 154 | return self.model.access.__dict__ 155 | 156 | @property 157 | def all_tags(self) -> dict: 158 | if self.id is None: 159 | raise Exception(f'Can not construct tags, id is not defined.') 160 | return {**self.tags, **{'access_' + k: v for k, v in self.access_tags.items()}} 161 | 162 | @property 163 | def model_name(self) -> str: 164 | return self.model.name if hasattr(self, 'model') and hasattr(self.model, 'name') else self.id 165 | 166 | @property 167 | def model_description(self) -> str: 168 | return self.model.description if hasattr(self, 'model') and hasattr(self.model, 'description') else str() 169 | 170 | 171 | class SchemaValidationException(Exception): 172 | def __init__(self, message: str, data_set: DataSet): 173 | self.data_set = data_set 174 | super().__init__(message) 175 | 176 | 177 | class ValidationException(Exception): 178 | def __init__(self, message: str): 179 | super().__init__(message) 180 | 181 | 182 | class ConnectionNotFoundException(Exception): 183 | pass 184 | 185 | 186 | class TableNotFoundException(Exception): 187 | pass 188 | 189 | 190 | class JobExecutionException(Exception): 191 | pass 192 | 193 | 194 | class ProcessorChainExecutionException(Exception): 195 | pass 196 | 197 | 198 | class ResolverException(Exception): 199 | pass 200 | 201 | 202 | class LocationDsn(AnyUrl): 203 | allowed_schemes = {'datastore', 'connection'} 204 | user_required = False 205 | 206 | 207 | class PostgresDsn(AnyUrl): 208 | allowed_schemes = {'postgres', 'postgresql'} 209 | user_required = False 210 | 211 | 212 | class JdbcDsn(AnyUrl): 213 | allowed_schemes = {'jdbc', 'jdbc'} 214 | user_required = False 215 | 216 | 217 | class MysqlDsn(AnyUrl): 218 | allowed_schemes = {'mysql', 'mysql'} 219 | user_required = False 220 | 221 | 222 | class IOType(str, Enum): 223 | model = 'model' 224 | connection = 'connection' 225 | file = 'file' 226 | 227 | 228 | class ArtefactType(str, Enum): 229 | model = 'model' 230 | product = 'product' 231 | 232 | 233 | class ConnectionType(str, Enum): 234 | jdbc = 'jdbc' 235 | postgresql = 'postgresql' 236 | redshift = 'redshift' 237 | mysql = 'mysql' 238 | mariadb = 'mariadb' 239 | mongodb = 'mongodb' 240 | s3 = 's3' 241 | csv = 'csv' 242 | parquet = 
'parquet' 243 | 244 | @classmethod 245 | def is_file(cls, conn_type: 'ConnectionType'): 246 | return conn_type in [ConnectionType.csv, ConnectionType.parquet, ConnectionType.s3] 247 | 248 | 249 | url_parsers = { 250 | ConnectionType.postgresql: PostgresDsn, 251 | ConnectionType.jdbc: JdbcDsn 252 | } 253 | 254 | 255 | class Connection(BaseModel): 256 | name: str 257 | principal: Union[str, None] 258 | credential: Union[SecretStr, None] 259 | host: str 260 | port: Union[conint(lt=65535), None] 261 | db_name: Union[str, None] 262 | ssl: bool = False 263 | type: ConnectionType 264 | timeout: int = 3600 265 | batch_size: int = 10000 266 | meta_data: Dict[str, Scalar] = {} 267 | 268 | class Config: 269 | validate_assignment = True 270 | 271 | @classmethod 272 | def is_port_required(cls, conn_type: Union[ConnectionType, str]): 273 | if isinstance(conn_type, str): 274 | conn_type = ConnectionType(conn_type) 275 | return not ConnectionType.is_file(conn_type) 276 | 277 | @classmethod 278 | def is_jdbc_supported(cls, conn_type: Union[ConnectionType, str]): 279 | return Connection.is_port_required(conn_type) 280 | 281 | @classmethod 282 | def is_db_name_required(cls, conn_type: Union[ConnectionType, str]): 283 | return Connection.is_port_required(conn_type) 284 | 285 | @classmethod 286 | def is_userinfo_required(cls, conn_type: Union[ConnectionType, str]): 287 | return Connection.is_port_required(conn_type) 288 | 289 | @classmethod 290 | def fill_url_contained_values(cls, values: dict, ctype: Union[ConnectionType, str]): 291 | def strip_path(string: str): 292 | return string.strip('/') 293 | 294 | validable_keys = ['principal', 'credential', 'port', 'db_name'] 295 | autofill_checkers = { 296 | 'port': Connection.is_port_required, 297 | 'principal': Connection.is_userinfo_required, 298 | 'credential': Connection.is_userinfo_required, 299 | 'db_name': Connection.is_db_name_required, 300 | } 301 | url_property_map = { 302 | 'port': ('port', None), 303 | 'host': ('host', None), 304 | 'principal': ('user', None), 305 | 'credential': ('password', None), 306 | 'db_name': ('path', strip_path) 307 | } 308 | none_valued_keys = [k for k in values.keys() if not values.get(k)] 309 | values_keys = set(list(values.keys()) + none_valued_keys) 310 | vk = set(validable_keys) 311 | missing_keys = vk.difference(values_keys) 312 | parsable_keys = [] 313 | for k in missing_keys: 314 | if autofill_checkers.get(k)(ctype): 315 | parsable_keys.append(k) 316 | else: 317 | values[k] = None 318 | if len(parsable_keys) == 0: 319 | return 320 | url_parser = url_parsers.get(ctype, AnyUrl) 321 | try: 322 | url: AnyUrl = parse_obj_as(url_parser, values.get('host')) 323 | for pk in parsable_keys: 324 | func_name, converter = url_property_map.get(pk) 325 | value = getattr(url, func_name) 326 | if not value: 327 | raise ValueError(f'The field {pk} is required and not provided in the url or directly.') 328 | if converter: 329 | values[pk] = converter(value) 330 | else: 331 | values[pk] = value 332 | except ValueError as verr: 333 | raise verr 334 | except TypeError as tep: 335 | raise ValueError( 336 | f'Programming error at Connection Validation: {str(tep)}. 
' 337 | f'Function name for property to be invoked on URL of type {type(url)}: {func_name}') 338 | except Exception as ex: 339 | raise ValueError( 340 | f'When one of the following fields is missing {validable_keys}, ' 341 | f'the $host URL must include its value; {str(ex)}') 342 | 343 | def get_native_connection_url(self, generate_creds=True) -> str: 344 | url_parser = url_parsers.get(self.type, AnyUrl) 345 | try: 346 | url: AnyUrl = parse_obj_as(url_parser, self.host) 347 | if Connection.is_userinfo_required(self.type): 348 | user = url.user or self.principal 349 | password = url.password or self.credential.get_secret_value() 350 | if Connection.is_db_name_required(self.type): 351 | path = url.path or f'/{self.db_name}' 352 | if Connection.is_port_required(self.type): 353 | port = url.port or self.port 354 | if generate_creds: 355 | return AnyUrl.build(scheme=url.scheme, user=user, password=password, host=url.host, port=port, 356 | path=path) 357 | else: 358 | return AnyUrl.build(scheme=url.scheme, host=url.host, port=port, path=path) 359 | except (error_wrappers.ValidationError, ValidationError): 360 | # not a url format 361 | passwd = self.credential.get_secret_value() if self.credential else '' 362 | userinfo = f'{self.principal}:{passwd}@' if Connection.is_userinfo_required(self.type) else '' 363 | host = self.host.strip('/') if self.host else '' 364 | port = f':{self.port}' if Connection.is_port_required(self.type) else '' 365 | db_path = f'/{self.db_name}' if Connection.is_db_name_required(self.type) else '' 366 | return f'{str(self.type.value)}://{userinfo}{host}{port}{db_path}' 367 | 368 | def get_jdbc_connection_url(self, generate_creds=True) -> str: 369 | if Connection.is_jdbc_supported(self.type): 370 | return f'jdbc:{self.get_native_connection_url(generate_creds)}' 371 | else: 372 | raise AssertionError(f"The connection {self.type.value} doesn't support JDBC.") 373 | 374 | @root_validator(pre=True) 375 | def check_host_url_dependent_fields(cls, values: dict): 376 | connection_type = values.get('type') 377 | host = values.get('host') 378 | if not host or not connection_type: 379 | raise ValueError('The host and the connection type must be defined.') 380 | Connection.fill_url_contained_values(values, connection_type) 381 | return values 382 | 383 | 384 | class DataProductTable(BaseModel): 385 | product_id: str 386 | table_id: str 387 | storage_location: str 388 | 389 | @property 390 | def storage_location_s3a(self): 391 | return self.storage_location.replace('s3://', 's3a://') 392 | 393 | 394 | def resolve_data_set_id(io_def: ConfigContainer) -> str: 395 | def xtract_domain(s): 396 | if '.' 
in s: 397 | domain_elements = s.rsplit('.') 398 | return domain_elements[len(domain_elements) - 1] 399 | else: 400 | return s 401 | 402 | if io_def.type == IOType.model: 403 | model_url = getattr(io_def, io_def.type) 404 | return xtract_domain(model_url) 405 | elif io_def.type == IOType.connection: 406 | return xtract_domain(io_def.model) if hasattr(io_def, 'model') else xtract_domain(io_def.table) 407 | elif io_def.type == IOType.file: 408 | if hasattr(io_def, IOType.model.name): 409 | return xtract_domain(getattr(io_def, IOType.model.name)) 410 | else: 411 | parsed_file = urlparse(io_def.file) 412 | filename = os.path.basename(parsed_file.path) 413 | return filename.rsplit('.')[0] 414 | else: 415 | raise ConnectionNotFoundException(f'The IO Type {io_def.type} is not supported.') 416 | 417 | 418 | def resolve_data_product_id(io_def: ConfigContainer) -> str: 419 | if io_def.type == IOType.model: 420 | return getattr(io_def, io_def.type).rsplit('.')[0] 421 | elif io_def.type == IOType.connection: 422 | return getattr(io_def, 'table').rsplit('.')[0] 423 | -------------------------------------------------------------------------------- /driver/driver.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import sys, os 6 | import traceback 7 | 8 | from pyspark.sql import SparkSession 9 | from driver import task_executor, packager 10 | from .packager import ziplib 11 | from .util import compile_models, compile_product 12 | 13 | __SPARK__: SparkSession = None 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def get_spark() -> SparkSession: 18 | if __SPARK__: 19 | return __SPARK__ 20 | else: 21 | raise RuntimeError('Spark Session is not created yet. 
Call init() first.') 22 | 23 | 24 | def get_or_create_session(config=None) -> SparkSession: # pragma: no cover 25 | """Build spark session for jobs running on cluster.""" 26 | spark = SparkSession.builder.appName(__name__) \ 27 | .config(conf=config) \ 28 | .enableHiveSupport() \ 29 | .getOrCreate() 30 | 31 | return spark 32 | 33 | 34 | def init(spark_session: SparkSession = None, spark_config=None): 35 | global __SPARK__ 36 | if not spark_session: 37 | logger.info('creating a new Spark session.') 38 | __SPARK__ = get_or_create_session(spark_config) 39 | else: 40 | logger.info('returning already existing Spark session.') 41 | __SPARK__ = spark_session 42 | # sc = __SPARK__.sparkContext 43 | # sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true") 44 | 45 | 46 | def install_dependencies(product_path: str): 47 | new_packages = packager.install_dependencies(product_path) 48 | if new_packages: 49 | logger.info(f'packaging up the following new dependencies {new_packages.keys()}') 50 | for new_pack_name in new_packages.keys(): 51 | zipfile = ziplib(new_packages.get(new_pack_name), new_pack_name) 52 | logger.info(f'-----> installing {zipfile}') 53 | get_spark().sparkContext.addPyFile(zipfile) 54 | logger.debug('=> Dependencies are installed.') 55 | 56 | 57 | def process_product(args, product_path: str): 58 | try: 59 | product = compile_product(product_path, args) 60 | models = compile_models(product_path, product) 61 | for task in product.pipeline.tasks: 62 | task_executor.execute(product, task, models, product_path) 63 | except Exception as e: 64 | traceback.print_exc() 65 | logger.error(f"Couldn't execute job due to >> {type(e).__name__}: {str(e)}") 66 | sys.exit(-1) 67 | -------------------------------------------------------------------------------- /driver/io_handlers.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
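# Input and output handlers: JDBC, file and data lake readers, plus the data lake writer that tags the stored files and updates the Glue catalog.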
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import os 6 | from .core import ConfigContainer 7 | from urllib.parse import urlparse 8 | 9 | from pyspark.sql import DataFrame, DataFrameWriter 10 | from driver.aws import glue_api, datalake_api 11 | from driver.core import Connection, resolve_data_set_id, resolve_data_product_id 12 | from driver.driver import get_spark 13 | from driver.task_executor import DataSet 14 | 15 | __CONN_PROVIDER__ = None 16 | __DATA_PRODUCT_PROVIDER__ = None 17 | 18 | from driver.util import check_property 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def init(connection_provider: callable, data_product_provider: callable): 24 | global __CONN_PROVIDER__, __DATA_PRODUCT_PROVIDER__ 25 | __CONN_PROVIDER__ = connection_provider 26 | __DATA_PRODUCT_PROVIDER__ = data_product_provider 27 | 28 | 29 | jdbc_drivers = { 30 | 'postgresql': 'org.postgresql.Driver', 31 | 'mysql': 'com.mysql.jdbc' 32 | } 33 | 34 | 35 | def connection_input_handler(props: ConfigContainer) -> DataFrame: 36 | connection: Connection = __CONN_PROVIDER__(props.connection) 37 | logger.info(f'using input conection: {connection.get_jdbc_connection_url(generate_creds=False)}') 38 | jdbcDF = ( 39 | get_spark() 40 | .read.format("jdbc") 41 | .option("url", connection.get_jdbc_connection_url(generate_creds=False)) 42 | .option("dbtable", props.table) 43 | .option("user", connection.principal) 44 | .option("password", connection.credential.get_secret_value()) 45 | .option("driver", jdbc_drivers.get(connection.type.name)) 46 | .option("ssl", connection.ssl) 47 | .option("sslmode", "require") 48 | .load() 49 | ) 50 | return jdbcDF 51 | 52 | 53 | def file_input_handler(props: ConfigContainer) -> DataFrame: 54 | def get_type(): 55 | return props.options.type or 'parquet' 56 | 57 | def get_separator(): 58 | return props.options.separator or ',' 59 | 60 | def get_infer_schema(): 61 | return props.options.infer_schema or 'false' 62 | 63 | def get_header(): 64 | return props.options.header or 'true' 65 | 66 | parsed = urlparse(props.file) 67 | scheme = 's3a' if parsed.scheme == 's3' else parsed.scheme 68 | if parsed.scheme: 69 | location = f'{scheme}://{parsed.netloc}{parsed.path}' 70 | else: 71 | location = f'{parsed.path}' 72 | logger.info(f'-> [File Input Handler]: reading from {location}') 73 | if hasattr(props, 'options'): 74 | df = get_spark().read.load(location, format=get_type(), sep=get_separator(), 75 | inferSchema=get_infer_schema(), header=get_header()) 76 | else: 77 | df = get_spark().read.load(location) 78 | return df 79 | 80 | 81 | def lake_input_handler(io_def: ConfigContainer) -> DataFrame: 82 | prod_id = resolve_data_product_id(io_def) 83 | model_id = resolve_data_set_id(io_def) 84 | data_product_table = __DATA_PRODUCT_PROVIDER__(prod_id, model_id) 85 | df = get_spark().read.parquet(data_product_table.storage_location_s3a) 86 | return df 87 | 88 | 89 | def file_output_handler(ds: DataSet, options: ConfigContainer): 90 | pass 91 | 92 | 93 | def resolve_compression(ds: DataSet): 94 | # todo: parse this into an enum 95 | # none, uncompressed, snappy, gzip, lzo, brotli, lz4, 96 | if check_property(ds, 'model.storage.options.compression'): 97 | return ds.model.storage.options.compression 98 | else: 99 | return 'snappy' 100 | 101 | 102 | def resolve_coalesce(ds: DataSet): 103 | if check_property(ds, 'model.storage.options.coalesce'): 104 | return ds.model.storage.options.coalesce 105 | else: 106 | return 2 107 | 108 | 109 | def resolve_header(ds: DataSet): 110 | if 
check_property(ds, 'model.storage.options.skip_first_row'): 111 | return ds.model.storage.options.skip_first_row 112 | else: 113 | return 'true' 114 | 115 | 116 | def lake_output_handler(ds: DataSet): 117 | output = f"{'s3a://'}{ds.dataset_storage_path.lstrip('/')}" 118 | logging.info(f'-> [Lake Output Handler]: writing data product to: {output}') 119 | ds.df.coalesce(resolve_coalesce(ds)).write \ 120 | .partitionBy(*ds.partitions or []) \ 121 | .format(ds.storage_format) \ 122 | .mode('overwrite') \ 123 | .option('header', resolve_header(ds)) \ 124 | .option('compression', resolve_compression(ds)) \ 125 | .save(output) 126 | 127 | datalake_api.tag_files(ds.storage_location, ds.path, ds.all_tags) 128 | 129 | # print(f'# partitions after write {ds.df.rdd.getNumPartitions()}') 130 | # todo: recheck coalesce value 131 | # todo: add parquet compression support / the glue catalog needs it too 132 | # todo: add bucket support & also to the glue catalog 133 | glue_api.update_data_catalog(ds) 134 | -------------------------------------------------------------------------------- /driver/packager.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os, sys, zipfile, itertools 5 | from typing import List, Dict 6 | from pip._vendor import pkg_resources 7 | 8 | 9 | def install_pip_package(packages: list): 10 | from pip._internal.commands import create_command 11 | install = create_command('install', isolated=False) 12 | install.main(packages) 13 | 14 | 15 | def ziplib(dist_path, package_name) -> str: 16 | libpath = os.path.dirname(os.path.join(dist_path, package_name)) 17 | zippath = f'{package_name}.zip' 18 | zf = zipfile.PyZipFile(zippath, mode='w') 19 | try: 20 | zf.debug = 3 21 | zf.writepy(libpath) 22 | return zippath 23 | finally: 24 | zf.close() 25 | 26 | 27 | def install_dependencies(product_path: str) -> Dict: 28 | """ Collects requirements from a requirements from the data product file, 29 | installs the dependencies and returns a dictionary with all installed packages and their path. 
30 | """ 31 | def collect_packages() -> set: 32 | ws = pkg_resources.WorkingSet(pkg_resources.working_set.entries) 33 | eks = ws.entry_keys 34 | return set(itertools.chain(*[eks.get(k) for k in eks.keys()])) 35 | 36 | def find_path_for_package(package_name): 37 | ws = pkg_resources.WorkingSet(pkg_resources.working_set.entries) 38 | eks = ws.entry_keys 39 | return next(iter([path for path in eks.keys() if package_name in eks.get(path)]), None) 40 | 41 | #todo: review and remove the one below 42 | 43 | # def collect_deps(package_name: str): 44 | # def merge_reqs(package: pkg_resources.DistInfoDistribution): 45 | # return_set = set({package.project_name}) 46 | # required_deps: List[pkg_resources.Requirement] = p.requires() 47 | # required_pnames = [r.project_name for r in required_deps] 48 | # for rpack in required_pnames: 49 | # return_set.update(merge_reqs(rpack)) 50 | # return return_set 51 | # ws = pkg_resources.WorkingSet(pkg_resources.working_set.entries) 52 | # package: pkg_resources.DistInfoDistribution = ws.by_key[package_name] 53 | # return merge_reqs(package) 54 | 55 | requirements = os.path.join(product_path, 'requirements.txt') 56 | if os.path.isfile(requirements): 57 | before = collect_packages() 58 | with open(requirements) as f: 59 | lines = [line.rstrip('\n') for line in f] 60 | install_pip_package(lines) 61 | after = collect_packages() 62 | delta_packages = after - before 63 | return_packs = dict() 64 | for delta_pack in delta_packages: 65 | return_packs[delta_pack] = find_path_for_package(delta_pack) 66 | return return_packs 67 | -------------------------------------------------------------------------------- /driver/processors.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
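# Built-in processors: constraint validators (not_null, unique, regexp, past, future, freshness), column transformers (anonymize, encrypt, skip, bucketize, rename_column), schema checking and type casting.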
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import hashlib 5 | import logging 6 | import re 7 | from datetime import datetime, timedelta 8 | from typing import List 9 | 10 | from driver import common 11 | from pyspark.sql import DataFrame, Window 12 | from pyspark.sql.functions import col, lit, udf, hash, to_date, row_number 13 | from pyspark.sql.types import StringType, StructField, TimestampType 14 | from pyspark.ml.feature import Bucketizer 15 | from driver.core import ValidationException, SchemaValidationException 16 | from driver.task_executor import DataSet 17 | 18 | from driver.util import check_property 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def null_validator(df: DataFrame, col_name: str, cfg: any = None): 24 | # null_value_ratio = df.select(count(when(col(col_name).isNull(), True)) / count(lit(1)).alias('count')) \ 25 | # .first()[0] 26 | # ('not_null', self.column, null_value_ratio <= self.threshold, self.threshold, null_value_ratio 27 | col = df.select(col_name) 28 | if col.filter((col[col_name].isNull()) | (col[col_name] == "")).count() > 0: 29 | raise ValidationException(f'Column: {col_name} is expected to be not null.') 30 | 31 | 32 | def regexp_validator(df: DataFrame, col_name: str, cfg: any = None): 33 | if not hasattr(cfg, 'value'): 34 | raise ValidationException(f'Column {col_name} has regexp constraint validator, but no value option provided.') 35 | col = df.select(col_name) 36 | if col.count() != col.filter(col[col_name].rlike(cfg.value)).count(): 37 | raise ValidationException(f"Column: [{col_name}] doesn't match regexp: {cfg.value}") 38 | 39 | 40 | def unique_validator(df: DataFrame, col_name: str, cfg: any = None): 41 | col = df.select(col_name) 42 | if col.distinct().count() != col.count(): 43 | raise ValidationException(f'Column: {col_name} is expected to be unique.') 44 | 45 | 46 | def resolve_time_delta(cfg): 47 | if hasattr(cfg, 'time_unit'): 48 | if cfg.time_unit == 'minutes': 49 | return timedelta(minutes=cfg.threshold) 50 | elif cfg.time_unit == 'hours': 51 | return timedelta(hours=cfg.threshold) 52 | elif cfg.time_unit == 'days': 53 | return timedelta(days=cfg.threshold) 54 | elif cfg.time_unit == 'weeks': 55 | return timedelta(weeks=cfg.threshold) 56 | elif cfg.time_unit == 'seconds': 57 | return timedelta(seconds=cfg.threshold) 58 | else: 59 | return timedelta(minutes=cfg.threshold) 60 | 61 | 62 | def past_validator(df: DataFrame, col_name: str, cfg: any = None): 63 | now = datetime.now() 64 | if cfg and hasattr(cfg, 'threshold'): 65 | now = now + resolve_time_delta(cfg) 66 | count = df.filter(df["trx_date"].cast(TimestampType()) >= lit(now)).count() 67 | if count > 0: 68 | raise ValidationException(f'Column {col_name} has values in the future (beyond {now}).') 69 | 70 | 71 | def future_validator(df: DataFrame, col_name: str, cfg: any = None): 72 | now = datetime.now() 73 | if cfg and hasattr(cfg, 'threshold'): 74 | now = now - resolve_time_delta(cfg) 75 | count = df.filter(df["trx_date"].cast(TimestampType()) <= lit(now)).count() 76 | if count > 0: 77 | raise ValidationException(f'Column {col_name} has values in the past (before {now}).') 78 | 79 | 80 | def freshness_validator(df: DataFrame, col_name: str, cfg: any = None): 81 | regex = re.compile('seconds|minutes|hours|days|weeks', re.I) 82 | if not hasattr(cfg, 'threshold') or not hasattr(cfg, 'time_unit') or not regex.match(str(cfg.time_unit)): 83 | raise ValidationException( 84 | f'[threshold] and [time_unit] options must be specified. 
Time units shoudl have one of the following values: seconds|minutes|hours|days|weeks.') 85 | if hasattr(cfg, 'group_by'): 86 | # df.withColumn("rn", row_number().over(Window.partitionBy(cfg.group_by).orderBy(col(col_name).desc()))) 87 | # df = df.filter(col("rn") == 1).drop("rn") 88 | res_df = df.select(col(col_name), col(cfg.group_by)).withColumn('rn', row_number().over( 89 | Window.partitionBy(cfg.group_by).orderBy(col(col_name).desc()))).filter(col('rn') == 1).drop('rn') 90 | threshold = datetime.now() - resolve_time_delta(cfg) 91 | for row in res_df.collect(): 92 | if row[col_name] < threshold: 93 | raise ValidationException( 94 | f'The most recent row for group [{cfg.group_by}] is older ({row[col_name]}) than the threshold ({threshold}).') 95 | else: 96 | threshold = datetime.now() - resolve_time_delta(cfg) 97 | most_recent = df.select(col(col_name)).orderBy(col(col_name).desc()).first()[col_name] 98 | if most_recent < threshold: 99 | raise ValidationException(f'The most recent row is older ({most_recent}) than the threshold ({threshold}).') 100 | 101 | 102 | def min_validator(df: DataFrame, col_name: str, cfg: any = None): 103 | # todo: implement min validator 104 | pass 105 | 106 | 107 | def max_validator(df: DataFrame, col_name: str, cfg: any = None): 108 | # todo: implement max validator 109 | pass 110 | 111 | 112 | constraint_validators = { 113 | "not_null": null_validator, 114 | "unique": unique_validator, 115 | "regexp": regexp_validator, 116 | "past": past_validator, 117 | "future": future_validator, 118 | "freshness": freshness_validator 119 | } 120 | 121 | 122 | def hasher(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 123 | # todo: implement salting 124 | return df.withColumn(col_name, hash(col(col_name))) 125 | 126 | 127 | def encrypt(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 128 | # todo: implement key handling + kms 129 | def encrypt_f(value: object, key: str = None): 130 | if key: 131 | return hashlib.sha256(str(value).encode() + key.encode()).hexdigest() 132 | else: 133 | return hashlib.sha256(str(value).encode()).hexdigest() 134 | 135 | encrypt_udf = udf(encrypt_f, StringType()) 136 | return df.withColumn(col_name, encrypt_udf(col_name, lit(cfg.key if hasattr(cfg, 'key') else None))) 137 | 138 | 139 | def skip_column(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 140 | return df.drop(col(col_name)) 141 | 142 | 143 | def rename_col(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 144 | # todo: update the schema for the dataset or remove this one 145 | return df.withColumnRenamed(col_name, cfg.name) 146 | 147 | 148 | def bucketize(df: DataFrame, col_name: str, cfg: any = None) -> DataFrame: 149 | buckets = cfg.buckets.__dict__ 150 | bucket_labels = dict(zip(range(len(buckets.values())), buckets.values())) 151 | bucket_splits = [float(split) for split in buckets.keys()] 152 | bucket_splits.append(float('Inf')) 153 | 154 | bucketizer = Bucketizer(splits=bucket_splits, inputCol=col_name, outputCol="tmp_buckets") 155 | bucketed = bucketizer.setHandleInvalid("keep").transform(df) 156 | 157 | udf_labels = udf(lambda x: bucket_labels[x], StringType()) 158 | bucketed = bucketed.withColumn(col_name, udf_labels("tmp_buckets")) 159 | bucketed = bucketed.drop(col('tmp_buckets')) 160 | 161 | return bucketed 162 | 163 | 164 | built_in_transformers = { 165 | 'anonymize': hasher, 166 | 'encrypt': encrypt, 167 | 'skip': skip_column, 168 | 'bucketize': bucketize, 169 | 'rename_column': rename_col 170 | } 171 | 172 | 173 | def 
find_schema_delta(ds: DataSet) -> List[StructField]: 174 | def lookup(name, schema_list): 175 | return next(filter(lambda rsf: rsf.name == name, schema_list)) 176 | 177 | if check_property(ds, 'model.columns'): 178 | required_schema = common.remap_schema(ds) 179 | data_frame_fields = [{'name': x.name, 'type': x.dataType} for x in ds.df.schema] 180 | required_schema_fields = [{'name': x.name, 'type': x.dataType} for x in required_schema] 181 | delta_fields = [x for x in required_schema_fields if x not in data_frame_fields] 182 | return [lookup(x.get('name'), required_schema) for x in delta_fields] 183 | else: 184 | return None 185 | 186 | 187 | def type_caster(ds: DataSet): 188 | try: 189 | mismatched_fields = find_schema_delta(ds) 190 | for mismatched_field in mismatched_fields or []: 191 | logger.info( 192 | f'--> typecasting [{mismatched_field.name}] to type: [{mismatched_field.dataType.typeName()}] in [{ds.id}]') 193 | field_in_df = next(iter([f for f in ds.df.schema.fields if f.name == mismatched_field.name]), None) 194 | if field_in_df: 195 | ds.df = ds.df.withColumn(mismatched_field.name, 196 | col(mismatched_field.name).cast(mismatched_field.dataType.typeName())) 197 | return ds 198 | except Exception as e: 199 | raise e 200 | 201 | 202 | def schema_checker(ds: DataSet): 203 | if check_property(ds, 'model.columns'): 204 | logger.info( 205 | f'-> checking schema for dataset [{ds.id}] with model id: [{ds.model.id}]. Data frame columns: {len(ds.df.columns)}') 206 | missing_fields = find_schema_delta(ds) 207 | if missing_fields: 208 | raise SchemaValidationException( 209 | f'The following fields are missing from the data set [{ds.id}]: {missing_fields}. ' 210 | f'Current schema: {ds.df.schema}', 211 | ds) 212 | if hasattr(ds, 'model') and hasattr(ds.model, 'validation') and ds.model.validation == 'strict': 213 | if not hasattr(ds, 'df'): 214 | raise SchemaValidationException(f'The dataset [{ds.id}] is missing a dataframe with strict validation', 215 | ds) 216 | if len(ds.df.columns) != len(ds.model.columns): 217 | xtra = set(ds.df.columns) - set([x.id for x in ds.model.columns]) 218 | raise SchemaValidationException( 219 | f'The dataset [{ds.id}] has a dataframe with more columns ({xtra}) than stated in the model', ds) 220 | return ds 221 | 222 | 223 | def razor(ds: DataSet): 224 | if hasattr(ds.model, 'xtra_columns') and ds.model.xtra_columns == 'raze': 225 | xtra_columns = list(set(ds.df.columns) - set([x.id for x in ds.model.columns])) 226 | ds.df = ds.df.drop(*xtra_columns) 227 | return ds 228 | 229 | 230 | def constraint_processor(ds: DataSet): 231 | if not check_property(ds, 'model.columns'): 232 | return ds 233 | 234 | for col in ds.model.columns: 235 | if not hasattr(col, 'constraints'): 236 | continue 237 | constraint_types = [c.type for c in col.constraints] 238 | for ctype in constraint_types: 239 | cvalidator = constraint_validators.get(ctype) 240 | if cvalidator: 241 | constraint = next(iter([co for co in col.constraints if co.type == ctype]), None) 242 | constraint_opts = constraint.options if hasattr(constraint, 'options') else None 243 | cvalidator(ds.df, col.id, constraint_opts) 244 | return ds 245 | 246 | 247 | def transformer_processor(data_set: DataSet): 248 | """ 249 | Will run a prebuilt a transformation on each and every column of the model. 
250 | :param data_set: the data set that contains the data frame; 251 | :return: the data set with the processed data frame 252 | """ 253 | if not check_property(data_set, 'model.columns'): 254 | return data_set 255 | for col in data_set.model.columns: 256 | if not hasattr(col, 'transform'): 257 | continue 258 | transformers = [t.type for t in col.transform] 259 | for trsfrm_type in transformers: 260 | tcall = built_in_transformers.get(trsfrm_type) 261 | if tcall: 262 | trsfrm = next(iter([to for to in col.transform if to.type == trsfrm_type]), None) 263 | trsfm_opts = trsfrm.options if trsfrm and hasattr(trsfrm, 'options') else None 264 | data_set.df = tcall(data_set.df, col.id, trsfm_opts) 265 | return data_set 266 | -------------------------------------------------------------------------------- /driver/schema/1.rc-1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "http://json-schema.org/draft-07/schema#", 4 | "title": "Data product schema", 5 | "type": "object", 6 | "required": [ 7 | "models", 8 | "schema_version" 9 | ], 10 | "additionalProperties": false, 11 | "properties": { 12 | "schema_version": { 13 | "type": "string", 14 | "description": "The version of this schema file" 15 | }, 16 | "models": { 17 | "type": "array", 18 | "minItems": 1, 19 | "items": { 20 | "$ref": "#/$defs/model" 21 | } 22 | } 23 | }, 24 | "$defs": { 25 | "model": { 26 | "type": "object", 27 | "additionalProperties": false, 28 | "required": [ 29 | "id", 30 | "version", 31 | "columns" 32 | ], 33 | "properties": { 34 | "id": { 35 | "type": "string", 36 | "minLength": 1 37 | }, 38 | "name": { 39 | "type": "string", 40 | "minLength": 1 41 | }, 42 | "version": { 43 | "type": "string", 44 | "minLength": 1 45 | }, 46 | "xtra_columns": { 47 | "type": "string", 48 | "enum": [ 49 | "raze", 50 | "ignore" 51 | ] 52 | }, 53 | "validation": { 54 | "type": "string", 55 | "enum": [ 56 | "strict", 57 | "lazy" 58 | ] 59 | }, 60 | "extends": { 61 | "type": "string", 62 | "minLength": 1 63 | }, 64 | "description": { 65 | "type": "string", 66 | "minLength": 1 67 | }, 68 | "meta": { 69 | "type": "object", 70 | "additionalProperties": true 71 | }, 72 | "storage": { 73 | "$ref": "#/$defs/storage" 74 | }, 75 | "tags": { 76 | "type": "object", 77 | "additionalProperties": true 78 | }, 79 | "access": { 80 | "type": "object", 81 | "additionalProperties": true, 82 | "properties": { 83 | "domain": { 84 | "type": "string", 85 | "minLength": 1 86 | }, 87 | "confidentiality": { 88 | "type": "string", 89 | "minLength": 1 90 | } 91 | } 92 | }, 93 | "columns": { 94 | "type": "array", 95 | "minItems": 1, 96 | "items": { 97 | "$ref": "#/$defs/column" 98 | } 99 | } 100 | } 101 | }, 102 | "storage": { 103 | "type": "object", 104 | "required": [ 105 | "type" 106 | ], 107 | "additionalProperties": false, 108 | "properties": { 109 | "type": { 110 | "type": "string" 111 | }, 112 | "format": { 113 | "type": "string" 114 | }, 115 | "options": { 116 | "type": "object", 117 | "additionalProperties": true 118 | }, 119 | "location": { 120 | "type": "string", 121 | "pattern": "([^ !$`&*()+]|(\\\\[ !$`&*()+]))+" 122 | } 123 | } 124 | }, 125 | "column": { 126 | "type": "object", 127 | "additionalProperties": false, 128 | "required": [ 129 | "id" 130 | ], 131 | "properties": { 132 | "id": { 133 | "type": "string", 134 | "minLength": 1 135 | }, 136 | "type": { 137 | "type": "string", 138 | "minLength": 1 139 | }, 140 | "source": { 141 | "type": [ 
142 | "array", 143 | "string" 144 | ], 145 | "items": { 146 | "type": "string" 147 | }, 148 | "minLength": 1 149 | }, 150 | "name": { 151 | "type": "string", 152 | "minLength": 1 153 | }, 154 | "description": { 155 | "type": "string", 156 | "minLength": 1 157 | }, 158 | "transform": { 159 | "type": "array", 160 | "minItems": 1, 161 | "items": { 162 | "type": "object", 163 | "properties": { 164 | "type": { 165 | "type": "string", 166 | "minLength": 1 167 | }, 168 | "options": { 169 | "type": "object" 170 | } 171 | } 172 | } 173 | }, 174 | "constraints": { 175 | "type": "array", 176 | "minItems": 1, 177 | "items": { 178 | "type": "object", 179 | "properties": { 180 | "type": { 181 | "type": "string", 182 | "minLength": 1 183 | }, 184 | "options": { 185 | "type": "object" 186 | } 187 | } 188 | } 189 | } 190 | } 191 | } 192 | } 193 | } -------------------------------------------------------------------------------- /driver/schema/1.rc-1/product.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "http://json-schema.org/draft-07/schema#", 4 | "title": "Data Product Schema", 5 | "description": "Used to validate the product.yml that defines the data product execution pipeline", 6 | "type": "object", 7 | "required": [ 8 | "schema_version", 9 | "product" 10 | ], 11 | "additionalProperties": false, 12 | "properties": { 13 | "schema_version": { 14 | "type": "string", 15 | "description": "The version of this schema file" 16 | }, 17 | "product": { 18 | "type": "object", 19 | "required": [ 20 | "id", 21 | "version", 22 | "owner", 23 | "pipeline", 24 | "description" 25 | ], 26 | "additionalProperties": false, 27 | "properties": { 28 | "id": { 29 | "type": "string", 30 | "minLength": 1 31 | }, 32 | "version": { 33 | "type": "string", 34 | "minLength": 1 35 | }, 36 | "owner": { 37 | "type": "string", 38 | "pattern": "^([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\\.[A-Z|a-z]{2,})+$" 39 | }, 40 | "name": { 41 | "type": "string", 42 | "minLength": 1 43 | }, 44 | "description": { 45 | "type": "string", 46 | "minLength": 1 47 | }, 48 | "defaults": { 49 | "$ref": "#/$defs/defaults" 50 | }, 51 | "engine": { 52 | "type": "string", 53 | "enum": [ 54 | "glue", 55 | "emr", 56 | "dbt" 57 | ] 58 | }, 59 | "pipeline": { 60 | "$ref": "#/$defs/pipeline" 61 | } 62 | } 63 | } 64 | }, 65 | "$defs": { 66 | "defaults": { 67 | "type": "object" 68 | }, 69 | "pipeline": { 70 | "type": "object", 71 | "required": [ 72 | "schedule", 73 | "tasks" 74 | ], 75 | "additionalProperties": false, 76 | "properties": { 77 | "schedule": { 78 | "type": "string", 79 | "minLength": 1 80 | }, 81 | "tasks": { 82 | "type": "array", 83 | "items": { 84 | "$ref": "#/$defs/task" 85 | } 86 | } 87 | } 88 | }, 89 | "task": { 90 | "type": "object", 91 | "required": [ 92 | "id", 93 | "inputs", 94 | "outputs" 95 | ], 96 | "additionalProperties": false, 97 | "properties": { 98 | "id": { 99 | "type": "string", 100 | "minLength": 1 101 | }, 102 | "logic": { 103 | "$ref": "#/$defs/task_logic" 104 | }, 105 | "inputs": { 106 | "type": "array", 107 | "items": { 108 | "anyOf": [ 109 | { 110 | "$ref": "#/$defs/io_type_connection" 111 | }, 112 | { 113 | "$ref": "#/$defs/io_type_model" 114 | }, 115 | { 116 | "$ref": "#/$defs/io_type_file" 117 | } 118 | ] 119 | } 120 | }, 121 | "outputs": { 122 | "type": "array", 123 | "items": { 124 | "anyOf": [ 125 | { 126 | "$ref": "#/$defs/io_type_connection" 127 | }, 128 | { 129 | "$ref": "#/$defs/io_type_model" 130 | }, 
131 | { 132 | "$ref": "#/$defs/io_type_file" 133 | } 134 | ] 135 | } 136 | } 137 | } 138 | }, 139 | "task_logic": { 140 | "type": "object", 141 | "additionalProperties": false, 142 | "required": [ 143 | "module" 144 | ], 145 | "properties": { 146 | "module": { 147 | "type": "string", 148 | "minLength": 1 149 | }, 150 | "parameters": { 151 | "type": "object", 152 | "additionalProperties": true 153 | } 154 | } 155 | }, 156 | "io_type_connection": { 157 | "type": "object", 158 | "required": [ 159 | "connection", 160 | "table" 161 | ], 162 | "additionalProperties": true, 163 | "properties": { 164 | "connection": { 165 | "type": "string", 166 | "minLength": 1 167 | }, 168 | "table": { 169 | "type": "string", 170 | "minLength": 1 171 | }, 172 | "model": { 173 | "type": "string", 174 | "minLength": 1 175 | } 176 | } 177 | }, 178 | "io_type_model": { 179 | "type": "object", 180 | "required": [ 181 | "model" 182 | ], 183 | "additionalProperties": false, 184 | "properties": { 185 | "model": { 186 | "type": "string", 187 | "mindLength": 1 188 | } 189 | } 190 | }, 191 | "io_type_file": { 192 | "type": "object", 193 | "required": [ 194 | "file" 195 | ], 196 | "additionalProperties": true, 197 | "properties": { 198 | "file": { 199 | "type": "string", 200 | "minLength": 1 201 | }, 202 | "model": { 203 | "type": "string", 204 | "minLength": 1 205 | }, 206 | "options": { 207 | "type": "object", 208 | "additionalProperties": false, 209 | "properties": { 210 | "type": { 211 | "type": "string", 212 | "minLength": 3 213 | }, 214 | "infer_schema": { 215 | "type": "boolean" 216 | }, 217 | "separator": { 218 | "type": "string", 219 | "minLength": 1 220 | }, 221 | "header": { 222 | "type": "boolean" 223 | } 224 | } 225 | } 226 | } 227 | } 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /driver/task_executor.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | import logging 6 | 7 | import sys 8 | 9 | from typing import List, Callable, Dict 10 | from .util import filter_list_by_id, enrich_models 11 | from .core import DataSet, DataProduct, IOType, ProcessorChainExecutionException, ValidationException, \ 12 | resolve_data_set_id, ResolverException, resolve_data_product_id, ConfigContainer 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | data_src_handlers: dict = dict() 17 | pre_processors: list = list() 18 | post_processors: list = list() 19 | transformers: dict = dict() 20 | output_handlers: dict = dict() 21 | 22 | 23 | def register_data_source_handler(src_type_id: str, handler: callable): 24 | data_src_handlers.update({src_type_id: handler}) 25 | 26 | 27 | def register_preprocessors(*handlers: callable): 28 | pre_processors.extend(handlers) 29 | 30 | 31 | def register_postprocessors(*handlers: callable): 32 | post_processors.extend(handlers) 33 | 34 | 35 | def register_transformer(transformer_id: str, handler: callable): 36 | transformers.update({transformer_id: handler}) 37 | 38 | 39 | def add_transformers(additional_transformers: Dict[str, callable]): 40 | transformers.update(additional_transformers) 41 | 42 | 43 | def register_output_handler(output_handler_type: str, handler: callable): 44 | output_handlers.update({output_handler_type: handler}) 45 | 46 | 47 | def resolve_io_type(io_definition: ConfigContainer) -> IOType: 48 | if hasattr(io_definition, IOType.connection.name): 49 | return IOType.connection 50 | elif hasattr(io_definition, IOType.file.name): 51 | return IOType.file 52 | elif hasattr(io_definition, IOType.model.name): 53 | return IOType.model 54 | else: 55 | raise ResolverException(f'This IO type is not supported yet: {io_definition.__repr__()}.') 56 | 57 | 58 | def load_inputs(product: ConfigContainer, inputs: ConfigContainer, models: List[ConfigContainer]) -> List[DataSet]: 59 | input_datasets: list[DataSet] = list() 60 | 61 | def load_input(input_def): 62 | handle_input = data_src_handlers.get(input_def.type) 63 | if not handle_input: 64 | raise Exception(f"Input source handler [{input_def.type}] not registered.") 65 | return handle_input(input_def) 66 | 67 | for inp in inputs: 68 | model_id = inp.model if hasattr(inp, 'model') else None 69 | setattr(inp, 'type', resolve_io_type(inp)) 70 | 71 | # dataset_id is build as follows 72 | # file: 73 | # model: . 74 | # connection: . 
75 | data_product_id = resolve_data_product_id(inp) 76 | dataset_id = f'{data_product_id}.{resolve_data_set_id(inp)}' if data_product_id else resolve_data_set_id(inp) 77 | 78 | model_obj = filter_list_by_id(models, model_id) 79 | 80 | dp = DataProduct(id=product.id, description=getattr(product, 'description', None), 81 | owner=getattr(product, 'owner', None)) 82 | input_datasets.append(DataSet(dataset_id, load_input(inp), model_obj, dp)) 83 | return input_datasets 84 | 85 | 86 | def run_processors(phase: str, datasets: List[DataSet], processors: List[Callable]) -> List[DataSet]: 87 | try: 88 | processed_dfs: list[datasets] = datasets 89 | for processor in processors: 90 | logger.info(f'-> running processor: [{processor.__name__}]') 91 | new_dss: list[datasets] = list() 92 | for ds in processed_dfs: 93 | new_dss.append(processor(ds)) 94 | processed_dfs = new_dss 95 | return processed_dfs 96 | except ValidationException as vex: 97 | raise ProcessorChainExecutionException( 98 | f'{type(vex).__name__} in processor [{processor.__name__}] at processor chain: [{phase}]: {str(vex)}') from vex 99 | except Exception as e: 100 | raise ProcessorChainExecutionException( 101 | f'{type(e).__name__} in [{processor.__name__}] at processor chain: [{phase}]: {str(e)}') from e 102 | 103 | 104 | def transform(inp_dfs: List[DataSet], product_path: str, custom_module_name, params=None) -> List[DataSet]: 105 | from driver.driver import get_spark 106 | sys.path.append(product_path) 107 | logger.info(f'executing custom module: {custom_module_name}') 108 | custom_module = importlib.import_module(custom_module_name) 109 | sys.modules[custom_module_name] = custom_module 110 | 111 | spark = get_spark() 112 | if params: 113 | return custom_module.execute(inp_dfs, spark, **params) 114 | else: 115 | return custom_module.execute(inp_dfs, spark) 116 | 117 | 118 | def sink(o_dfs: List[DataSet]): 119 | for out_dataset in o_dfs: 120 | handle_output = output_handlers.get(out_dataset.storage_type) 121 | if not handle_output: 122 | raise Exception(f'Storage handler identified by {out_dataset.storage_type} is not found.') 123 | handle_output(out_dataset) 124 | 125 | 126 | def enrich(datasets: List[DataSet], product: ConfigContainer, models: List[ConfigContainer]): 127 | for dataset in datasets: 128 | if not dataset.product_id: 129 | dataset.product_id = product.id 130 | if not dataset.product_owner: 131 | dataset.product.owner = getattr(product, 'owner', None) 132 | if dataset.model is None: 133 | default_model = enrich_models(ConfigContainer(models=[ConfigContainer(id=dataset.id)]), product=product)[0] 134 | model_obj = next(iter([m for m in models if m.id == dataset.id]), default_model) 135 | dataset.model = model_obj 136 | return datasets 137 | 138 | 139 | def filter_output_models(task_outputs: List[ConfigContainer], models: List[ConfigContainer]): 140 | output_model_names = [to.model for to in task_outputs if hasattr(to, 'model')] 141 | return [model for model in models if model.id in output_model_names] 142 | 143 | 144 | def execute(product: ConfigContainer, task: ConfigContainer, models: List[ConfigContainer], product_path: str) \ 145 | -> List[DataSet]: 146 | logger.info(f'executing tasks > [{task.id}] for data product [{product.id}].') 147 | 148 | output_models = filter_output_models(task.outputs, models) 149 | input_dfs: list[DataSet] = run_processors('pre', load_inputs(product, task.inputs, models), pre_processors) 150 | input_dfs = enrich(input_dfs, product, output_models) 151 | 152 | task_logic_module = 
task.logic.module if hasattr(task, 'logic') and hasattr(task.logic, 153 | 'module') else 'builtin.ingest' 154 | task_logic_params = task.logic.parameters.__dict__ if hasattr(task, 'logic') and hasattr(task.logic, 155 | 'parameters') else {} 156 | output_dfs: list[DataSet] = transform(input_dfs, product_path, task_logic_module, task_logic_params) 157 | output_dfs = enrich(output_dfs, product, output_models) 158 | 159 | sink(run_processors('post', output_dfs, post_processors)) 160 | 161 | return output_dfs 162 | -------------------------------------------------------------------------------- /driver/util.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import functools 5 | from io import DEFAULT_BUFFER_SIZE 6 | import json 7 | import logging 8 | import os 9 | import yaml 10 | 11 | from typing import List, Any 12 | from jsonschema import validate, ValidationError, Draft3Validator 13 | from yaml.scanner import ScannerError 14 | from driver.core import ArtefactType, ConfigContainer 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def run_chain(input_payload, *callables: callable): 20 | functions = list() 21 | functions.extend(callables) 22 | result = input_payload 23 | for func in functions: 24 | func_name = func.func.__name__ if isinstance(func, functools.partial) else func.__name__ 25 | logger.info(f'chain > executing: {func_name}') 26 | try: 27 | result = func(result) 28 | except Exception as exc: 29 | logger.error(f'{type(exc).__name__} while executing <{func_name}> with error: {str(exc)}') 30 | raise 31 | return result 32 | 33 | 34 | def parse_dict_into_object(d: dict): 35 | x = ConfigContainer() 36 | for k, v in d.items(): 37 | if isinstance(v, dict): 38 | setattr(x, k, parse_dict_into_object(v)) 39 | elif isinstance(v, list): 40 | object_list = list() 41 | for e in v: 42 | object_list.append(parse_dict_into_object(e) if isinstance(e, dict) else e) 43 | setattr(x, k, object_list) 44 | else: 45 | setattr(x, str(k), v) 46 | return x 47 | 48 | 49 | def load_yaml(file_path: str): 50 | logger.info(f'loading file {file_path}') 51 | try: 52 | with open(fr'{file_path}') as file: 53 | return yaml.safe_load(file) 54 | except ScannerError as scerr: 55 | logger.error(f'Could not read [{file_path}] due to: {str(scerr)}') 56 | raise scerr 57 | 58 | 59 | def safe_get_property(object: Any, property: str): 60 | return getattr(object, property) if hasattr(object, property) else None 61 | 62 | 63 | def check_property(object, nested_property: str): 64 | """ 65 | :param object: the object to analyze 66 | :param nested_property: the nested properties separated by dots (.) (eg. 
model.storage.location) 67 | :return: True if the nested property can be found on the object; 68 | """ 69 | current_object = object 70 | for element in nested_property.split('.'): 71 | if hasattr(current_object, element): 72 | current_object = getattr(current_object, element) 73 | else: 74 | return False 75 | return True 76 | 77 | 78 | def filter_list_by_id(object_list, object_id): 79 | return next(iter([m for m in object_list if m.id == object_id]), None) 80 | 81 | 82 | def validate_schema(validable_dict: dict, artefact_type: ArtefactType): 83 | schema_vesion = validable_dict.get('schema_version') 84 | if not schema_vesion: 85 | raise ValidationError('schema_version keyword must be provided') 86 | script_folder = os.path.dirname(os.path.abspath(__file__)) 87 | schema_path = os.path.join(script_folder, 'schema', schema_vesion, f'{artefact_type.name}.json') 88 | with open(schema_path) as schema: 89 | schema = json.load(schema) 90 | try: 91 | validate(validable_dict, schema) 92 | except ValidationError as verr: 93 | for err in sorted(Draft3Validator(schema).iter_errors(validable_dict), key=str): 94 | logger.error(f'validation error detail: {err.message}') 95 | logger.error(f"{type(verr).__name__} while checking [{artefact_type.name}]: {str(verr)}") 96 | raise verr 97 | return validable_dict 98 | 99 | 100 | def enrich_product(product_input: ConfigContainer, args): 101 | # todo: replace this with a proper object merging logic 102 | product = product_input.product 103 | if not hasattr(product, 'defaults'): 104 | setattr(product, 'defaults', ConfigContainer()) 105 | if hasattr(args, 'default_data_lake_bucket') and not hasattr(product.defaults, 'storage'): 106 | storage = ConfigContainer() 107 | setattr(storage, 'location', args.default_data_lake_bucket) 108 | logger.debug(f'product defaults {product.defaults}') 109 | setattr(product.defaults, 'storage', storage) 110 | if not check_property(product, 'defaults.storage.location'): 111 | setattr(product.defaults.storage, 'location', args.default_data_lake_bucket) 112 | return product 113 | 114 | 115 | def enrich_models(models: ConfigContainer, product: ConfigContainer): 116 | def add_back_types(model, extended_model): 117 | columns_with_missing_type = [col for col in model.columns if not hasattr(col, 'type')] 118 | for col in columns_with_missing_type: 119 | setattr(col, 'type', filter_list_by_id(extended_model.columns, col.id).type) 120 | 121 | def decorate_model_with_defaults(model): 122 | if hasattr(product, 'defaults'): 123 | if not hasattr(model, 'storage') and hasattr(product.defaults, 'storage'): 124 | setattr(model, 'storage', product.defaults.storage) 125 | if not hasattr(model.storage, 'location') and hasattr(product.defaults.storage, 'location'): 126 | setattr(model.storage, 'location', product.defaults.storage.location) 127 | if not hasattr(model.storage, 'options') and hasattr(product.defaults.storage, 'options'): 128 | setattr(model.storage, 'options', product.defaults.storage.options) 129 | if not hasattr(model.storage, 'type'): 130 | setattr(model.storage, 'type', 'lake') 131 | if not hasattr(model.storage, 'format'): 132 | setattr(model.storage, 'format', 'parquet') 133 | return model 134 | 135 | compiled_models = list() 136 | for model in models.models: 137 | if hasattr(model, 'extends'): 138 | extended_model = filter_list_by_id(models.models, model.extends) 139 | if not extended_model: 140 | raise Exception( 141 | f'Cannot extend model {model.id} with {extended_model} because the root model is not found.') 142 | 
current_model_columns = set([col.id for col in model.columns]) 143 | extended_model_columns = set([col.id for col in extended_model.columns]) 144 | inherited_column_ids = extended_model_columns - current_model_columns 145 | inherited_columns = [filter_list_by_id(extended_model.columns, col_id) for col_id in inherited_column_ids] 146 | model.columns.extend(inherited_columns) 147 | add_back_types(model, extended_model) 148 | compiled_models.append(decorate_model_with_defaults(model)) 149 | return compiled_models 150 | 151 | 152 | def compile_product(product_path: str, args, prod_def_filename: str = 'product.yml'): 153 | part_enrich_product = functools.partial(enrich_product, args=args) 154 | part_validate_schema = functools.partial(validate_schema, artefact_type=ArtefactType.product) 155 | product_path = os.path.join(product_path, prod_def_filename) 156 | product_processing_chain = [load_yaml, part_validate_schema, parse_dict_into_object, part_enrich_product] 157 | return run_chain(product_path, *product_processing_chain) 158 | 159 | 160 | def compile_models(product_path: str, product: ConfigContainer, def_file_name: str = 'model.yml') -> List[ 161 | ConfigContainer]: 162 | model_path = os.path.join(product_path, def_file_name) 163 | part_validate_schema = functools.partial(validate_schema, artefact_type=ArtefactType.model) 164 | part_enrich_model = functools.partial(enrich_models, product=product) 165 | model_processing_chain = [load_yaml, part_validate_schema, parse_dict_into_object, part_enrich_model] 166 | return run_chain(model_path, *model_processing_chain) 167 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import configparser 5 | import importlib 6 | import logging 7 | import os 8 | import argparse 9 | import sys 10 | 11 | from pyspark import SparkConf 12 | import traceback 13 | import driver 14 | import driver.aws.providers 15 | from driver.aws.providers import connection_provider, datalake_provider 16 | from driver.driver import get_spark 17 | from driver.io_handlers import connection_input_handler, lake_input_handler, file_input_handler 18 | from driver.processors import schema_checker, constraint_processor, transformer_processor, type_caster, razor 19 | from driver.io_handlers import lake_output_handler, connection_input_handler 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def init_aws(args): 25 | profile = None 26 | region = None 27 | if hasattr(args, 'aws_profile'): 28 | profile = args.aws_profile 29 | if hasattr(args, 'aws_region'): 30 | region = args.aws_region 31 | driver.aws.providers.init(profile=profile, region=region) 32 | 33 | 34 | def build_spark_configuration(args, config: configparser.RawConfigParser, custom_hook: callable = None): 35 | conf = SparkConf() 36 | if hasattr(args, 'aws_profile'): 37 | logger.info(f'Setting aws profile: {args.aws_profile}') 38 | os.environ["AWS_PROFILE"] = args.aws_profile 39 | conf.set("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider") 40 | if hasattr(args, 'local') and args.local: 41 | """ local execution, dependencies should be configured """ 42 | deps_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'spark_deps') 43 | local_jars = [file for file in os.listdir(deps_path) if file.endswith('.jar')] 44 | if hasattr(args, 'jars'): 45 | local_jars.extend([f'{deps_path}/{j}' for j in args.jars.strip().split(',')]) 46 | jars = ','.join([os.path.join(deps_path, j) for j in local_jars]) 47 | conf.set("spark.jars", jars) 48 | if config: 49 | spark_jars = 'spark jars' 50 | if spark_jars in config.sections(): 51 | for k, v in config.items(spark_jars): 52 | conf.set(k, v) 53 | return custom_hook.enrich_spark_conf(conf) if custom_hook and hasattr(custom_hook, 'enrich_spark_conf') else conf 54 | 55 | 56 | def read_config(product_path: str) -> configparser.RawConfigParser: 57 | config_path = os.path.join(product_path, 'config.ini') 58 | if os.path.isfile(config_path): 59 | config = configparser.ConfigParser() 60 | config.read(config_path) 61 | return config 62 | else: 63 | return None 64 | 65 | 66 | def get_custom_hook(product_path: str) -> callable: 67 | hook_module_name = 'init_hook' 68 | hook_file = f'{hook_module_name}.py' 69 | hook_file_name = os.path.join(product_path, hook_file) 70 | if os.path.exists(hook_file_name): 71 | sys.path.append(product_path) 72 | logger.info(f'executing custom hooks: {hook_file_name}') 73 | module = importlib.import_module(hook_module_name) 74 | sys.modules[hook_module_name] = module 75 | return module 76 | else: 77 | return None 78 | 79 | 80 | def init_system(args): 81 | driver.io_handlers.init(connection_provider, datalake_provider) 82 | rel_product_path = os.path.join(args.product_path, '') if hasattr(args, 'product_path') else os.path.join('./', '') 83 | product_path = os.path.join(os.path.abspath(rel_product_path), '') 84 | config = read_config(product_path) 85 | custom_hook = get_custom_hook(product_path) 86 | driver.init(spark_config=build_spark_configuration(args, config, custom_hook)) 87 | logger.debug(f'using Spark configuration: {get_spark().sparkContext.getConf().getAll()}') 88 | 
logger.debug(f'the following jar packages are deployed: {get_spark().sparkContext._jsc.sc().listJars()}') 89 | driver.install_dependencies(product_path) 90 | driver.register_data_source_handler('connection', connection_input_handler) 91 | driver.register_data_source_handler('model', lake_input_handler) 92 | driver.register_data_source_handler('file', file_input_handler) 93 | driver.register_postprocessors(transformer_processor, razor, constraint_processor, type_caster, schema_checker) 94 | driver.register_output_handler('default', lake_output_handler) 95 | driver.register_output_handler('lake', lake_output_handler) 96 | if custom_hook: 97 | if hasattr(custom_hook, 'add_post_processors'): 98 | driver.register_postprocessors(*custom_hook.add_post_processors()) 99 | if hasattr(custom_hook, 'add_pre_processors'): 100 | driver.register_preprocessors(*custom_hook.add_pre_processors()) 101 | # if hasattr(custom_hook, 'add_transformers'): 102 | # driver.add_transformers(custom_hook.add_transformers()) 103 | # todo: the transformer dict is not used, the processor built-in transformers are the only ones looked up now 104 | driver.process_product(args, product_path) 105 | 106 | 107 | def main(): 108 | try: 109 | parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS) 110 | parser.add_argument('--JOB_ID', help='the unique id of this Glue job') 111 | parser.add_argument('--JOB_RUN_ID', help='the unique id of this Glue job run') 112 | parser.add_argument('--JOB_NAME', help='the name of this Glue job') 113 | parser.add_argument('--job-bookmark-option', help="job-bookmark-disable if you don't want bookmarking") 114 | parser.add_argument('--TempDir', help='temporary results directory') 115 | parser.add_argument('--product_path', help='the data product definition folder') 116 | parser.add_argument('--aws_profile', help='the AWS profile to be used for connection') 117 | parser.add_argument('--aws_region', help='the AWS region to be used') 118 | parser.add_argument('--local', action='store_true', help='local development') 119 | parser.add_argument('--jars', help='extra jars to be added to the Spark context') 120 | parser.add_argument('--additional-python-modules', help='this is used by Glue, ignored by this code') 121 | parser.add_argument('--default_data_lake_bucket', help='Data Mesh output S3 bucket name', default=None) 122 | parser.add_argument('--log_level', choices=['debug', 'info', 'warning'], help='Set the desired log level', default='info') 123 | args, unknown = parser.parse_known_args() 124 | logging.basicConfig() 125 | log_level = logging.getLevelName(args.log_level.upper()) 126 | logger.setLevel(log_level) 127 | logging.getLogger('driver').setLevel(log_level) 128 | logger.info(f"KNOWN_ARGS: {args}") 129 | logger.info(f"UNKNOWN_ARGS: {unknown}") 130 | logger.info(f'PATH: {os.environ["PATH"]}') 131 | logger.info(f'SPARK_HOME: {os.environ.get("SPARK_HOME")}') 132 | logger.info(f'PYTHONPATH: {os.environ.get("PYTHONPATH")}') 133 | 134 | init_aws(args) 135 | if hasattr(args, "JOB_NAME") and not (hasattr(args, 'local') and args.local): 136 | import zipfile 137 | 138 | with zipfile.ZipFile(f'{os.path.dirname(os.path.abspath(__file__))}/{args.JOB_NAME}.zip', 'r') as zip_ref: 139 | zip_ref.extractall(f'{os.path.dirname(os.path.abspath(__file__))}/') 140 | init_system(args=args) 141 | except Exception as e: 142 | logging.exception(e) 143 | traceback.print_exc() 144 | raise e 145 | 146 | 147 | if __name__ == '__main__': 148 | main() 149 | 
-------------------------------------------------------------------------------- /package.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | [pytest] 5 | addopts = -v --durations=0 6 | log_cli = 1 7 | log_cli_level = INFO 8 | #addopts = -v --durations=0 -s --log-cli-level 1 9 | testpaths = tests 10 | python_files = test_*.py *_test.py *_tests.py 11 | #env = PYTHONPATH=/glue-libs/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8/python:/glue-libs/aws-glue-libs/target/AWSGlueETLPython-1.0.0.jar:$PYTHONPATH 12 | markers = 13 | integration: test which require connection to real resources 14 | ignore: tests which should not run 15 | mock_use_standalone_module = false 16 | #nosecuredirs=tests/retired/ -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | -r requirements.txt 5 | 6 | pyspark 7 | pyspark-stubs 8 | pytest-spark 9 | pytest-mock 10 | pytest-helpers-namespace 11 | pytest-env 12 | pytest-cov 13 | pytest 14 | numpy 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | boto3==1.18.34 5 | botocore 6 | wheel==0.38.1 7 | pyyaml==5.4.1 8 | pydantic==1.10.10 9 | quinn 10 | boto3-stubs[glue]==1.18.34 11 | mypy-boto3-glue==1.18.34 12 | jsonschema==3.0.2 13 | pyspark==3.4.0 14 | numpy>=1.19.5 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.0.4 3 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?P
<pre>.*)
 4 | serialize = 
 5 | 	{major}.{minor}.{patch}{pre}
 6 | 	{major}.{minor}.{patch}
 7 | 
 8 | [bumpversion:file:setup.py]
 9 | 
10 | [bumpversion:file:version.sh]
11 | 
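# Illustrative note (not part of the original config): with this configuration, running
#   bump2version patch
# locally would bump 1.0.4 to 1.0.5 and rewrite the version string in setup.py and
# version.sh via the file sections listed above.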
12 | [bdist_wheel]
13 | universal = 0
14 | 
15 | [aliases]
16 | test = pytest
17 | 
18 | [metadata]
19 | description-file = README.md
20 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | from os import path, system  # 'system' is used by CleanCommand.run() below
 4 | from pip._internal.req import parse_requirements
 5 | from setuptools import setup, find_packages, Command
 6 | 
 7 | here = path.abspath(path.dirname(__file__))
 8 | 
 9 | with open(path.join(here, 'README.md'), encoding='utf-8') as f:
10 |     long_description = f.read()
11 | 
12 | requirements = [str(ir.requirement) for ir in parse_requirements(
13 |     'requirements.txt', session=False)]
14 | 
15 | 
16 | class CleanCommand(Command):
17 |     user_options = []
18 | 
19 |     def initialize_options(self):
20 |         pass
21 | 
22 |     def finalize_options(self):
23 |         pass
24 | 
25 |     def run(self):
26 |         system(
27 |             'rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info ./htmlcov '
28 |             './spark-warehouse ./driver/spark-warehouse ./metastore_db ./coverage_html ./.pytest_cache ./derby.log ./tests/local_results ./tasks/__pycache__')
29 | 
30 | 
31 | setup(
32 |     name="data-product-processor",
33 |     version="1.0.4",
34 |     description="The data product processor (dpp) is a library for dynamically creating and executing Apache Spark Jobs based on a declarative description of a data product.",
35 |     long_description=long_description,
36 |     long_description_content_type='text/markdown',
37 |     author="Amazon Web Services",
38 |     url='https://github.com/aws-samples/dpac-data-product-processor',
39 |     packages=find_packages(
40 |         exclude=(
41 |             "contrib",
42 |             "docs",
43 |             "tests",
44 |         )
45 |     ),
46 |     py_modules=[
47 |         'main',
48 |     ],
49 |     install_requires=requirements,
50 |     include_package_data=True,
51 |     platforms="any",
52 |     license="Apache License 2.0",
53 |     zip_safe=False,
54 |     cmdclass={
55 |         'clean_all': CleanCommand,
56 |         # 'package': Package
57 |     },
58 |     entry_points={
59 |         "console_scripts": ["data-product-processor=main:main"],
60 |     },
61 | )
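# Usage sketch (added note, not part of the original file): build a distributable wheel with
#   python setup.py bdist_wheel
# or remove local build artefacts via the custom command registered in cmdclass above:
#   python setup.py clean_all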
62 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/aws_glue_dc_connection.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Connection": {
 3 |     "ConnectionProperties": {
 4 |       "JDBC_CONNECTION_URL": "jdbc:postgresql://deng-pub-immersion-day.cofepwz7osto.eu-west-1.rds.amazonaws.com:5432/sportstickets",
 5 |       "JDBC_ENFORCE_SSL": "false",
 6 |       "PASSWORD": "some_pass",
 7 |       "USERNAME": "some_username"
 8 |     },
 9 |     "ConnectionType": "JDBC",
10 |     "CreationTime": "2021-08-11 15:44:08.285000+02:00",
11 |     "Description": "The connection to the test database",
12 |     "LastUpdatedTime": "2021-08-11 15:44:08.285000+02:00",
13 |     "Name": "test_db_connection",
14 |     "PhysicalConnectionRequirements": {
15 |       "AvailabilityZone": "eu-central-1a",
16 |       "SecurityGroupIdList": [
17 |         "sg-00d42f53e8f8b1963"
18 |       ],
19 |       "SubnetId": "subnet-0cbf5810a184046ce"
20 |     }
21 |   },
22 |   "ResponseMetadata": {
23 |     "HTTPHeaders": {
24 |       "connection": "keep-alive",
25 |       "content-length": "566",
26 |       "content-type": "application/x-amz-json-1.1",
27 |       "date": "Mon, 30 Aug 2021 12:36:13 GMT",
28 |       "x-amzn-requestid": "8f3e12d5-9626-4407-b217-62ce80b8c460"
29 |     },
30 |     "HTTPStatusCode": 200,
31 |     "RequestId": "8f3e12d5-9626-4407-b217-62ce80b8c460",
32 |     "RetryAttempts": 0
33 |   }
34 | }


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/glue_dc_get_db_rsp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Database": {
 3 |     "CatalogId": "588942721560",
 4 |     "CreateTableDefaultPermissions": [
 5 |       {
 6 |         "Permissions": [
 7 |           "ALL"
 8 |         ],
 9 |         "Principal": {
10 |           "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
11 |         }
12 |       }
13 |     ],
14 |     "CreateTime": "2021-09-03 12:27:22+02:00",
15 |     "Name": "customers"
16 |   },
17 |   "ResponseMetadata": {
18 |     "HTTPHeaders": {
19 |       "connection": "keep-alive",
20 |       "content-length": "246",
21 |       "content-type": "application/x-amz-json-1.1",
22 |       "date": "Fri, 03 Sep 2021 15:57:29 GMT",
23 |       "x-amzn-requestid": "7f953ee9-9221-4953-8a0e-4dc3917cdfe2"
24 |     },
25 |     "HTTPStatusCode": 200,
26 |     "RequestId": "7f953ee9-9221-4953-8a0e-4dc3917cdfe2",
27 |     "RetryAttempts": 0
28 |   }
29 | }


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/glue_dc_get_db_rsps.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "DatabaseList": [
 3 |     {
 4 |       "CatalogId": "588942721560",
 5 |       "CreateTableDefaultPermissions": [
 6 |         {
 7 |           "Permissions": [
 8 |             "ALL"
 9 |           ],
10 |           "Principal": {
11 |             "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
12 |           }
13 |         }
14 |       ],
15 |       "CreateTime": "2021-09-03 12:27:22+02:00",
16 |       "Name": "customers"
17 |     },
18 |     {
19 |       "CatalogId": "588942721560",
20 |       "CreateTableDefaultPermissions": [
21 |         {
22 |           "Permissions": [
23 |             "ALL"
24 |           ],
25 |           "Principal": {
26 |             "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
27 |           }
28 |         }
29 |       ],
30 |       "CreateTime": "2021-08-27 09:29:58+02:00",
31 |       "Name": "sportstickets"
32 |     },
33 |     {
34 |       "CatalogId": "588942721560",
35 |       "CreateTableDefaultPermissions": [
36 |         {
37 |           "Permissions": [
38 |             "ALL"
39 |           ],
40 |           "Principal": {
41 |             "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
42 |           }
43 |         }
44 |       ],
45 |       "CreateTime": "2021-08-11 15:43:31+02:00",
46 |       "Description": "The Main Glue Database for the data Mesh",
47 |       "Name": "test_db",
48 |       "Parameters": {}
49 |     }
50 |   ],
51 |   "ResponseMetadata": {
52 |     "HTTPHeaders": {
53 |       "connection": "keep-alive",
54 |       "content-length": "795",
55 |       "content-type": "application/x-amz-json-1.1",
56 |       "date": "Fri, 03 Sep 2021 15:59:23 GMT",
57 |       "x-amzn-requestid": "7e2ba307-a94f-49e9-8c64-c5c283eae938"
58 |     },
59 |     "HTTPStatusCode": 200,
60 |     "RequestId": "7e2ba307-a94f-49e9-8c64-c5c283eae938",
61 |     "RetryAttempts": 0
62 |   }
63 | }


--------------------------------------------------------------------------------
/tests/assets/aws_api_rsps/glue_gc_get_table_rsp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "Table": {
 3 |         "Name": "person",
 4 |         "DatabaseName": "test_db",
 5 |         "Owner": "owner",
 6 |         "CreateTime": "2021-09-04 23:22:51+02:00",
 7 |         "UpdateTime": "2021-09-05 13:19:54+02:00",
 8 |         "LastAccessTime": "2021-09-05 13:19:54+02:00",
 9 |         "Retention": 0,
10 |         "StorageDescriptor": {
11 |             "Columns": [
12 |                 {
13 |                     "Name": "id",
14 |                     "Type": "double"
15 |                 },
16 |                 {
17 |                     "Name": "full_name",
18 |                     "Type": "string"
19 |                 }
20 |             ],
21 |             "Location": "s3://glue-job-test-destination-bucket/person/",
22 |             "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
23 |             "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
24 |             "Compressed": false,
25 |             "NumberOfBuckets": -1,
26 |             "SerdeInfo": {
27 |                 "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
28 |                 "Parameters": {
29 |                     "serialization.format": "1"
30 |                 }
31 |             },
32 |             "BucketColumns": [],
33 |             "SortColumns": [],
34 |             "Parameters": {
35 |                 "CrawlerSchemaDeserializerVersion": "1.0",
36 |                 "CrawlerSchemaSerializerVersion": "1.0",
37 |                 "UPDATED_BY_CRAWLER": "customer_person",
38 |                 "averageRecordSize": "89",
39 |                 "classification": "parquet",
40 |                 "compressionType": "none",
41 |                 "objectCount": "7",
42 |                 "recordCount": "100",
43 |                 "sizeKey": "12647",
44 |                 "typeOfData": "file"
45 |             },
46 |             "StoredAsSubDirectories": false
47 |         },
48 |         "PartitionKeys": [
49 |             {
50 |                 "Name": "gender",
51 |                 "Type": "string"
52 |             },
53 |             {
54 |                 "Name": "age",
55 |                 "Type": "string"
56 |             }
57 |         ],
58 |         "TableType": "EXTERNAL_TABLE",
59 |         "Parameters": {
60 |             "CrawlerSchemaDeserializerVersion": "1.0",
61 |             "CrawlerSchemaSerializerVersion": "1.0",
62 |             "UPDATED_BY_CRAWLER": "customer_person",
63 |             "averageRecordSize": "89",
64 |             "classification": "parquet",
65 |             "compressionType": "none",
66 |             "objectCount": "7",
67 |             "recordCount": "100",
68 |             "sizeKey": "12647",
69 |             "typeOfData": "file"
70 |         },
71 |         "CreatedBy": "arn:aws:sts::588942721560:assumed-role/AWSGlueServiceRole-crawler/AWS-Crawler",
72 |         "IsRegisteredWithLakeFormation": false,
73 |         "CatalogId": "588942721560"
74 |     },
75 |     "ResponseMetadata": {
76 |         "RequestId": "84d46abf-4cf0-4b08-91b4-42c855ffeac6",
77 |         "HTTPStatusCode": 200,
78 |         "HTTPHeaders": {
79 |             "date": "Sun, 05 Sep 2021 15:16:50 GMT",
80 |             "content-type": "application/x-amz-json-1.1",
81 |             "content-length": "1606",
82 |             "connection": "keep-alive",
83 |             "x-amzn-requestid": "84d46abf-4cf0-4b08-91b4-42c855ffeac6"
84 |         },
85 |         "RetryAttempts": 0
86 |     }
87 | }


--------------------------------------------------------------------------------
/tests/assets/integration/model.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | models:
 3 |   - id: person_pii
 4 |     version: "1.0.0"
 5 |     name: xxxxxxx
 6 |     description: A person, who can be a customer, including PII
 7 |     columns:
 8 |       - id: id
 9 |         type: integer
10 |         constraints:
11 |           - type: unique
12 |           - type: not_null
13 |       - id: first_name
14 |         type: string
15 |       - id: last_name
16 |         type: string
17 |       - id: full_name
18 |         type: string
19 |       - id: gender
20 |         type: string
21 |         constraints:
22 |           - type: not_null
23 |           - type: regexp
24 |             options:
25 |               value: '^male|female$'
26 |       - id: age
27 |         type: integer
28 |     meta:
29 |       contains_pii: true
30 |     storage:
31 |       type: lake
32 |       format: parquet
33 | #      location: 'glue-job-test-destination-bucket/person_pii'
34 |       options:
35 |         skip_first_row: true
36 |         partition_by:
37 |           - gender
38 |           - age
39 |         bucketed_at: 512M
40 |     tags:
41 |       cost_center: 123455
42 |       use_case: Customer 360
43 |     access:
44 |       domain: customer_support
45 |       confidentiality: private
46 | 
47 |   - id: person_pub
48 |     version: "1.0.0"
49 |     description: public personal data
50 |     extends: person_pii
51 |     columns:
52 |       - id: full_name
53 |         transform:
54 |           - type: encrypt
55 |       - id: first_name
56 |         transform:
57 |           - type: skip
58 |       - id: last_name
59 |         transform:
60 |           - type: skip
61 |       - id: age
62 |         type: string
63 |         transform:
64 |           - type: bucketize
65 |             options:
66 |               buckets:
67 |                 0: 0-19
68 |                 20: 20-39
69 |                 40: 40+
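          # illustrative note (not part of the original asset): the built-in bucketize transformer
          # turns these keys into the ascending splits [0.0, 20.0, 40.0, +Inf] and replaces the
          # numeric age value with the matching label, e.g. 27 -> "20-39".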
70 |     meta:
71 |       contains_pii: false
72 |     storage:
73 |       type: lake
74 | #      location: 'glue-job-test-destination-bucket/person_pub'
75 |       format: parquet
76 |       options:
77 |         skip_first_row: true
78 |         partition_by:
79 |           - gender
80 |           - age
81 |         bucketed_at: 512M
82 |     tags:
83 |       cost_center: 123455
84 |       use_case: Customer 360
85 |     access:
86 |       domain: customer_support
87 |       confidentiality: public
88 | 
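# Added note (not part of the original asset): because person_pub extends person_pii, the model
# compiler (driver.util.enrich_models) copies every column not redefined here (e.g. id, gender)
# from person_pii into person_pub and backfills missing column types from the parent definition.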


--------------------------------------------------------------------------------
/tests/assets/integration/product.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | product:
 3 |   id: product_a_customers
 4 |   version: "1.0.0"
 5 |   defaults:
 6 |       storage:
 7 |         location:
 8 |       engine: glue
 9 |   owner: jane.doe@acme.com
10 |   name: Customers
11 |   description: All customer data
12 |   pipeline:
13 |     schedule: "0 */1 * * *"
14 |     tasks:
15 |       - id: extract customer data
16 |         logic:
17 |           module: tasks.custom_business_logic
18 |           parameters:
19 |             create_timestamp: true
20 |         inputs:
21 |           - connection: test_db_connection
22 |             table: dms_sample.person_relevant
23 |         outputs:
24 |           - model: person_pii
25 |           - model: person_pub
26 | #      - id: save anonymized version of customers
27 | #        inputs:
28 | #          - model:
29 | #        outputs:
30 | #          - model: person_pub
31 | 
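# Added note (not part of the original asset): at run time the task executor loads the JDBC input
# through the registered 'connection' handler, passes the resulting DataSets to
# tasks.custom_business_logic.execute() together with the declared parameters, and writes each
# returned DataSet to the lake under its matching output model (person_pii, person_pub).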


--------------------------------------------------------------------------------
/tests/assets/integration/tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/assets/integration/tasks/__init__.py


--------------------------------------------------------------------------------
/tests/assets/integration/tasks/custom_business_logic.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import datetime
 3 | from typing import List
 4 | 
 5 | from pyspark.sql.functions import concat, col, lit, unix_timestamp
 6 | from driver.common import find_dataset_by_id
 7 | from driver.task_executor import DataSet
 8 | 
 9 | 
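# Added note (not part of the original asset): driver.task_executor.transform() imports this module
# via logic.module in product.yml and calls execute() with the loaded input DataSets, the
# SparkSession and any declared logic.parameters as keyword arguments; the DataSets returned here
# are matched to the task's output models (person_pub, person_pii) by their id before
# post-processing and storage.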
10 | def execute(inp_dfs: List[DataSet], spark=None, create_timestamp=False):  # 'spark' is passed by task_executor.transform()
11 |     ds = find_dataset_by_id(inp_dfs, 'person_relevant')
12 | 
13 |     if create_timestamp:
14 |         timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
15 |         ds.df = ds.df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
16 | 
17 |     df = ds.df.withColumn('full_name', concat(col('first_name'), lit(' '), col('last_name')))
18 | 
19 |     ds_pub = DataSet(id='person_pub', df=df)
20 |     ds_pii = DataSet(id='person_pii', df=df)
21 | 
22 | 
23 |     return [ds_pub, ds_pii]


--------------------------------------------------------------------------------
/tests/assets/integration_calendar/model.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | models:
 3 |   - id: calendar
 4 |     version: "1.0.0"
 5 |     name: some name
 6 |     description: Enriched sports calendar events
 7 |     columns:
 8 |       - id: id
 9 |         type: long
10 |         constraints:
11 |           - type: unique
12 |           - type: not_null
13 |       - id: start_date_time
14 |         type: timestamp
15 |       - id: sport_type_name
16 |         type: string
17 |       - id: away_team
18 |         type: string
19 |       - id: home_team
20 |         type: string
21 |       - id: location
22 |         type: string
23 | 
24 |     meta:
25 |       contains_pii: true
26 |       steward: jane.doe@acme.com
27 |     storage:
28 |       type: lake
29 |       location: '/glue-job-test-destination-bucket/calendar'
30 |       options:
31 |         skip_first_row: true
32 |         partition_by:
33 |           - sport_type_name
34 |         bucketed_at: 512M
35 |         stored_as: parquet
36 |     tags:
37 |       cost_center: 123455
38 |       use_case: Customer 360
39 |     access:
40 |       domain: customer_support
41 |       confidentiality: private
42 | 


--------------------------------------------------------------------------------
/tests/assets/integration_calendar/product.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | product:
 3 |   id: calendar
 4 |   version: "1.0.0"
 5 |   owner: jane.doe@acme.com
 6 |   name: Customers
 7 |   description: All customer data
 8 |   pipeline:
 9 |     schedule: "0 3 * * *"
10 |     tasks:
11 |       - id: aggregate_events
12 |         engine: glue
13 |         logic:
14 |           module: tasks.custom_aggregate_events
15 |         inputs:
16 |           - model: events
17 |           - model: teams
18 |           - model: locations
19 | 


--------------------------------------------------------------------------------
/tests/assets/integration_file/model.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | models:
 3 |   - id: industry
 4 |     version: "1.0"
 5 |     name: industry report file
 6 |     description: a random industry report file ingested from the internet
 7 |     columns:
 8 |       - id: year
 9 |         type: integer
10 |         constraints:
11 |           - type: not_null
12 |       - id: industry_code_ANZSIC
13 |         name: Industry Code
14 |         description: the code of the industry
15 |         type: string
16 |       - id: industry_name_ANZSIC
17 |         type: string
18 |       - id: rme_size_grp
19 |         type: string
20 |       - id: variable
21 |         type: string
22 |       - id: value
23 |         type: string
24 |       - id: unit
25 |         type: string
26 |     meta:
27 |       contains_pii: true
28 |       steward: jane.doe@acme.com
29 |     tags:
30 |       cost_center: 123455
31 |       use_case: Customer 360
32 |     access:
33 |       domain: customer_support
34 |       confidentiality: private


--------------------------------------------------------------------------------
/tests/assets/integration_file/product.yml:
--------------------------------------------------------------------------------
 1 | schema_version: 1.rc-1
 2 | product:
 3 |   id: some_data_product
 4 |   owner: jane@acme.com
 5 |   description: some description is required
 6 |   defaults:
 7 |     storage:
 8 |       options:
 9 |         compression: gzip
10 |   version: "1.0.0"
11 |   engine: glue
12 |   pipeline:
13 |     schedule: "0 3 * * *"
14 |     tasks:
15 |       - id: process_some_files
16 |         logic:
17 |           module: builtin.ingest
18 |           parameters:
19 |             create_timestamp: true
20 |         inputs:
21 |           - file: s3://datalakebucket-588942721560/csvs/annual-enterprise-survey-2020-financial-year-provisional-size-bands-csv.csv
22 |             options:
23 |               type: csv
24 |               infer_schema: true
25 |               separator: ','
26 |               header: true
27 |         outputs:
28 |           - model: industry
29 | 
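# Added note (not part of the original asset): 'file' inputs are served by the registered
# file_input_handler (see main.py), which reads the CSV with the options above; the single output
# is then validated against the 'industry' model by the post-processors and stored with the
# product's default storage options (gzip compression).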


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/assets/integration_sport_events/__init__.py


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/model.yml:
--------------------------------------------------------------------------------
  1 | models:
  2 |   - id: event
  3 |     version: "1.0.0"
  4 |     description: Sport event
  5 |     columns:
  6 |       - id: id
  7 |         type: long
  8 |         constraints:
  9 |           - type: unique
 10 |           - type: not_null
 11 |       - id: sport_type_name
 12 |         type: string
 13 |       - id: home_team_id
 14 |         type: integer
 15 |       - id: away_team_id
 16 |         type: integer
 17 |       - id: location_id
 18 |         type: short
 19 |       - id: start_date_time
 20 |         type: timestamp
 21 |       - id: start_date
 22 |         type: date
 23 |         transform:
 24 |           - type: skip
 25 |       - id: sold_out
 26 |         type: short
 27 |         transform:
 28 |           - type: skip
 29 | 
 30 |     meta:
 31 |       contains_pii: true
 32 |       steward: jane.doe@acme.com
 33 |     storage:
 34 |       type: lake
 35 |       options:
 36 |         skip_first_row: true
 37 |         partition_by:
 38 |           - sport_type_name
 39 |         bucketed_at: 512M
 40 |         stored_as: parquet
 41 |         location: 's3a://glue-job-test-destination-bucket/sport_event'
 42 |     tags:
 43 |       cost_center: 123455
 44 |       use_case: Customer 360
 45 |     access:
 46 |       domain: customer_support
 47 |       confidentiality: private
 48 | 
 49 |   - id: location
 50 |     version: "1.0.0"
 51 |     description: Sport Location
 52 |     columns:
 53 |       - id: id
 54 |         type: integer
 55 |         constraints:
 56 |           - type: unique
 57 |           - type: not_null
 58 |       - id: name
 59 |         type: string
 60 |       - id: city
 61 |         type: string
 62 |       - id: seating_capacity
 63 |         type: integer
 64 |       - id: levels
 65 |         type: integer
 66 |       - id: sections
 67 |         type: integer
 68 | 
 69 |     meta:
 70 |       contains_pii: true
 71 |       steward: jane.doe@acme.com
 72 |     storage:
 73 |       type: lake
 74 |       options:
 75 |         skip_first_row: true
 76 |         partition_by:
 77 |           - levels
 78 |         bucketed_at: 512M
 79 |         stored_as: parquet
 80 |         location: 's3a://glue-job-test-destination-bucket/location'
 81 |     tags:
 82 |       cost_center: 123455
 83 |       use_case: Customer 360
 84 |     access:
 85 |       domain: customer_support
 86 |       confidentiality: private
 87 | 
 88 |   - id: team
 89 |     version: "1.0.0"
 90 |     description: Sport Team
 91 |     columns:
 92 |       - id: id
 93 |         type: integer
 94 |         constraints:
 95 |           - type: unique
 96 |           - type: not_null
 97 |       - id: name
 98 |         type: string
 99 |       - id: abbreviated_name
100 |         type: string
101 |       - id: home_field_id
102 |         type: integer
103 |       - id: sport_type_name
104 |         type: string
105 |       - id: sport_league_short_name
106 |         type: string
107 |       - id: sport_division_short_name
108 |         type: string
109 | 
110 |     meta:
111 |       contains_pii: true
112 |       steward: jane.doe@acme.com
113 |     storage:
114 |       type: lake
115 |       options:
116 |         skip_first_row: true
117 |         partition_by:
118 |           - sport_league_short_name
119 |           - sport_division_short_name
120 |         bucketed_at: 512M
121 |         stored_as: parquet
122 |         location: 's3a://glue-job-test-destination-bucket/team'
123 |     tags:
124 |       cost_center: 123455
125 |       use_case: Customer 360
126 |     access:
127 |       domain: customer_support
128 |       confidentiality: private
129 | 


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/product.yml:
--------------------------------------------------------------------------------
 1 | product:
 2 |   id: sport_events
 3 |   version: "1.0.0"
 4 |   description: Sporting Events
 5 |   pipeline:
 6 |     schedule: "0 3 * * *"
 7 |     tasks:
 8 |       - id: extract_sport_events
 9 |         logic:
10 |           module: builtin.ingest
11 |         input:
12 |           - id: events
13 |             type: connection
14 |             connection_id: test_db_connection
15 |             table: dms_sample.sporting_event
16 |             model: event
17 |       - id: extract_locations
18 |         logic:
19 |           module: builtin.ingest
20 |         input:
21 |           - id: locations
22 |             type: connection
23 |             connection_id: test_db_connection
24 |             table: dms_sample.sport_location_int
25 |             model: location
26 |       - id: extract_teams
27 |         logic:
28 |           module: builtin.ingest
29 |         input:
30 |           - id: teams
31 |             type: connection
32 |             connection_id: test_db_connection
33 |             table: dms_sample.sport_team_int
34 |             model: team
35 | 
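
Note: each task above delegates to the built-in `builtin.ingest` module and reads from a Glue connection. The sketch below shows the registration pattern for running such a product locally with mocked handlers, mirroring tests/test_task_executor.py later in this listing; `spark_session`, `stub_frames` and the product path are assumptions of this sketch, not repo code.

import driver
from driver import ConfigContainer, DataSet

def mock_connection_handler(props: ConfigContainer):
    # stub_frames (assumed): a dict mapping table names such as 'dms_sample.sporting_event'
    # to pre-built Spark DataFrames shaped like the models in model.yml above
    return stub_frames[props.table]

def mock_output_handler(ds: DataSet):
    ds.df.show()

driver.init(spark_session)  # spark_session (assumed): an already created SparkSession
driver.register_data_source_handler('connection', mock_connection_handler)
driver.register_output_handler('default', mock_output_handler)
driver.register_output_handler('lake', mock_output_handler)

app_args = ConfigContainer()
setattr(app_args, 'product_path', 'tests/assets/integration_sport_events')  # path assumed
driver.process_product(app_args, 'tests/assets/integration_sport_events')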


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/assets/integration_sport_events/tasks/__init__.py


--------------------------------------------------------------------------------
/tests/assets/integration_sport_events/tasks/custom_aggregate_events.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from pyspark.sql.functions import col
 4 | 
 5 | from driver.common import find_dataset_by_id
 6 | from driver.core import DataSet
 7 | 
 8 | 
 9 | def execute(inp_dfs: List[DataSet]):
10 |     events = find_dataset_by_id(inp_dfs, 'events').df.alias('events')
11 |     teams = find_dataset_by_id(inp_dfs, 'teams').df.alias('teams')
12 |     locations = find_dataset_by_id(inp_dfs, 'locations').df.alias('locations')
13 | 
14 |     events = events.join(locations, on=events.location_id == locations.id).select('events.*', col("locations.name").alias('location'))
15 |     events = events.join(teams, on=events.home_team_id == teams.id).select('events.*', col('location'), col("teams.name").alias('home_team'))
16 |     events = events.join(teams, on=events.away_team_id == teams.id).select('events.*', col('location'), col('home_team'), col("teams.name").alias('away_team'))
17 | 
18 |     events = events.drop(col('location_id'))
19 |     events = events.drop(col('home_team_id'))
20 |     events = events.drop(col('away_team_id'))
21 | 
22 |     output_ds = DataSet(id='calendar', df=events)
23 |     return [output_ds]
24 | 
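
Note: the task module above joins the three ingested datasets and emits a single `calendar` dataset. A hedged smoke test (not part of the repo) is sketched below; it assumes the module is importable on the current sys.path as `custom_aggregate_events` and that the driver package is installed.

from pyspark.sql import SparkSession
from driver.core import DataSet
from custom_aggregate_events import execute  # import path is an assumption; adjust to your layout

spark = SparkSession.builder.master('local[1]').getOrCreate()
events = spark.createDataFrame([(1, 'Baseball', 10, 11, 5)],
                               'id long, sport_type_name string, home_team_id int, away_team_id int, location_id int')
teams = spark.createDataFrame([(10, 'Tigers'), (11, 'Lions')], 'id int, name string')
locations = spark.createDataFrame([(5, 'City Arena')], 'id int, name string')

calendar, = execute([DataSet(id='events', df=events),
                     DataSet(id='teams', df=teams),
                     DataSet(id='locations', df=locations)])
calendar.df.show()  # one row with location, home_team and away_team resolved, id columns dropped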


--------------------------------------------------------------------------------
/tests/assets/metafiles/model.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: person
 7 |     version: "1.0"
 8 |     columns:
 9 |       - id: id
10 |         type: integer
11 |         constraints:
12 |           - type: unique
13 |           - type: not_null
14 |       - id: first_name
15 |         type: string
16 |       - id: last_name
17 |         type: string
18 |       - id: age
19 |         type: integer
20 |       - id: city
21 |         type: string
22 |       - id: gender
23 |         type: string
24 |     meta:
25 |       contains_pii: true
26 |       steward: jane.doe@acme.com
27 |     tags:
28 |       cost_center: 123455
29 |       use_case: Customer 360
30 |     access:
31 |       domain: customer_support
32 |       confidentiality: private
33 |   - id: transaction
34 |     version: "1.0"
35 |     columns:
36 |       - id: id
37 |         type: integer
38 |         constraints:
39 |           - type: unique
40 |           - type: not_null
41 |       - id: sku
42 |         type: string
43 |       - id: trx_date
44 |         type: timestamp
45 |       - id: geo
46 |         type: string
47 |       - id: items
48 |         type: integer
49 |     meta:
50 |       contains_pii: false
51 |       steward: jane.doe@acme.com
52 |     tags:
53 |       cost_center: 123455
54 |       use_case: Customer 360
55 |     access:
56 |       domain: customer_support
57 |       confidentiality: private
58 |   - id: ratings
59 |     version: "1.0"
60 |     columns:
61 |       - id: userId
62 |         type: integer
63 |       - id: movieId
64 |         type: integer
65 |       - id: rating
66 |         type: integer
67 |       - id: timestamp
68 |         type: long
69 |     meta:
70 |       contains_pii: false
71 |       steward: jane.doe@acme.com
72 |     tags:
73 |       cost_center: 123455
74 |       use_case: Customer 360
75 |     access:
76 |       domain: customer_support
77 |       confidentiality: private
78 |   - id: movie
79 |     version: "1.0"
80 |     columns:
81 |       - id: movieId
82 |         type: integer
83 |       - id: title
84 |         type: string
85 |       - id: genres
86 |         type: string
87 |     meta:
88 |       contains_pii: false
89 |       steward: jane.doe@acme.com
90 |     tags:
91 |       cost_center: 123455
92 |       use_case: Customer 360
93 |     access:
94 |       domain: customer_support
95 |       confidentiality: private
96 | 
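
Note: model definitions like the one above are validated against the JSON schema bundled under driver/schema/1.rc-1. A minimal sketch, mirroring tests/test_model_compilation.py later in this listing (the working directory and asset path are assumptions):

import os
from driver import util
from driver.core import ArtefactType

asset_path = os.path.join('tests', 'assets', 'metafiles')           # assumes the repo root as cwd
model_def = util.load_yaml(os.path.join(asset_path, 'model.yml'))
util.validate_schema(model_def, ArtefactType.model)                  # raises jsonschema.ValidationError when invalid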


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_compilation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: person_pii
 7 |     version: "1.0.0"
 8 |     name: xxxxxxx
 9 |     description: A person, who can be a customer, including PII
10 |     columns:
11 |       - id: id
12 |         type: integer
13 |         constraints:
14 |           - type: unique
15 |           - type: not_null
16 |       - id: first_name
17 |         type: string
18 |       - id: last_name
19 |         type: string
20 |       - id: full_name
21 |         type: string
22 |       - id: gender
23 |         type: string
24 |         constraints:
25 |           - type: not_null
26 |           - type: regexp
27 |             options:
 28 |               value: '(Male|Female)'
29 |       - id: age
30 |         type: integer
31 |     meta:
32 |       contains_pii: true
33 |     storage:
34 |       type: lake
35 |       format: parquet
36 |       options:
37 |         skip_first_row: true
38 |         partition_by:
39 |           - gender
40 |           - age
41 |         bucketed_at: 512M
42 |     tags:
43 |       cost_center: 123455
44 |       use_case: Customer 360
45 |     access:
46 |       domain: customer_support
47 |       confidentiality: private
48 |   - id: person_pub
49 |     version: "1.0.0"
50 |     extends: person_pii
 51 |     description: a dataset with anonymised and pseudonymised columns
52 |     columns:
53 |       - id: full_name
54 |         transform:
55 |           - type: encrypt
56 |       - id: first_name
57 |         transform:
58 |           - type: skip
59 |       - id: last_name
60 |         transform:
61 |           - type: skip
62 |       - id: age
63 |         type: string
64 |         transform:
65 |           - type: bucketize
66 |             options:
67 |               buckets:
68 |                 0: 0-19
69 |                 20: 20-39
70 |                 40: 40+
71 |     meta:
72 |       contains_pii: false
73 |     storage:
74 |       type: lake
75 |       location: 'glue-job-test-destination-bucket/person_pub'
76 |       options:
77 |         skip_first_row: true
78 |         partition_by:
79 |           - gender
80 |           - age
81 |         bucketed_at: 512M
82 |         stored_as: parquet
83 |     tags:
84 |       cost_center: 123455
85 |       use_case: Customer 360
86 |     access:
87 |       domain: customer_support
88 |       confidentiality: public
89 | 
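
Note: `person_pub` above uses `extends: person_pii`, so columns, types and transforms are merged from the parent model at compilation time, and models without an explicit storage location fall back to the default data lake bucket. A hedged sketch of how these fixtures are compiled, following tests/test_model_compilation.py (the path and bucket value are assumptions):

from driver import ConfigContainer
from driver.util import compile_product, compile_models, filter_list_by_id

args = ConfigContainer()
setattr(args, 'default_data_lake_bucket', 's3://test-bucket')        # assumed default bucket
product = compile_product('tests/assets/metafiles', args, prod_def_filename='product_compilation.yml')
models = compile_models('tests/assets/metafiles', product, def_file_name='model_compilation.yml')

person_pii = filter_list_by_id(models, 'person_pii')
print(person_pii.storage.location)                         # falls back to the default bucket (no explicit location above)
person_pub = filter_list_by_id(models, 'person_pub')
print(filter_list_by_id(person_pub.columns, 'id').type)    # 'integer', inherited from person_pii via extends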


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_correct.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: person_pii
 7 |     version: "1.0.0"
 8 |     name: xxxxxxx
 9 |     description: straightforward model configuration
10 |     columns:
11 |       - id: id
12 |         type: integer
13 |         constraints:
14 |           - type: unique
15 |           - type: not_null
16 |       - id: first_name
17 |         type: string
18 |       - id: last_name
19 |         type: string
20 |       - id: full_name
21 |         type: string
22 |       - id: gender
23 |         type: string
24 |         constraints:
25 |           - type: not_null
26 |           - type: regexp
27 |             options:
 28 |               value: '(Male|Female)'
29 |       - id: age
30 |         type: integer
31 |     meta:
32 |       contains_pii: true
33 |     storage:
34 |       type: lake
35 |       location: 'glue-job-test-destination-bucket/person_pii'
36 |       format: parquet
37 |       options:
38 |         skip_first_row: true
39 |         partition_by:
40 |           - gender
41 |           - age
42 |         bucketed_at: 512M
43 |     tags:
44 |       cost_center: 123455
45 |       use_case: Customer 360
46 |     access:
47 |       domain: customer_support
48 |       confidentiality: private
49 | 
50 |   - id: person_pub
51 |     version: "1.0.0"
52 |     extends: person_pii
 53 |     description: a dataset with anonymised and pseudonymised columns
54 |     columns:
55 |       - id: full_name
56 |         transform:
57 |           - type: encrypt
58 |       - id: first_name
59 |         transform:
60 |           - type: skip
61 |       - id: last_name
62 |         transform:
63 |           - type: skip
64 |       - id: age
65 |         type: string
66 |         transform:
67 |           - type: bucketize
68 |             options:
69 |               buckets:
70 |                 0: 0-19
71 |                 20: 20-39
72 |                 40: 40+
73 |     meta:
74 |       contains_pii: false
75 |     storage:
76 |       type: lake
77 |       location: 'glue-job-test-destination-bucket/person_pub'
78 |       options:
79 |         skip_first_row: true
80 |         partition_by:
81 |           - gender
82 |           - age
83 |         bucketed_at: 512M
84 |         stored_as: parquet
85 |     tags:
86 |       cost_center: 123455
87 |       use_case: Customer 360
88 |     access:
89 |       domain: customer_support
90 |       confidentiality: public
91 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_remove_xtra_columns.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: movie
 7 |     version: "1.0"
 8 |     xtra_columns: raze
 9 |     validation: strict
10 |     columns:
11 |       - id: movieId
12 |         type: integer
13 |       - id: title
14 |         type: string
15 |       - id: genres
16 |         type: string
17 |     meta:
18 |       contains_pii: false
19 |       steward: jane.doe@acme.com
20 |     tags:
21 |       cost_center: 123455
22 |       use_case: Customer 360
23 |     access:
24 |       domain: customer_support
25 |       confidentiality: private
26 | 
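
Note: `xtra_columns: raze` together with `validation: strict` means undeclared columns are removed by the razor processor so that strict schema validation can still pass. A minimal sketch based on tests/test_df_schema_validator.py later in this listing; `spark_session`, the compiled `movie_model` and the DataProduct values are assumptions of this sketch.

from driver.core import DataSet, DataProduct
from driver.processors import razor, schema_checker

# movie_model (assumed): the compiled 'movie' model from this file; spark_session (assumed): a SparkSession
df = spark_session.createDataFrame([(1, 'Heat (1995)', 'Action|Crime|Thriller', 'noise')],
                                   'movieId int, title string, genres string, unexpected string')
ds = DataSet(id='movie', df=df, model=movie_model,
             product=DataProduct(id='demo', description='demo product', owner='jane@acme.com'))
razor(ds)                 # drops the undeclared 'unexpected' column
ds = schema_checker(ds)   # passes despite validation: strict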


--------------------------------------------------------------------------------
/tests/assets/metafiles/model_strict_validation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | models:
 6 |   - id: movie
 7 |     version: "1.0"
 8 |     validation: strict
 9 |     columns:
10 |       - id: movieId
11 |         type: integer
12 |       - id: title
13 |         type: string
14 |       - id: genres
15 |         type: string
16 |     meta:
17 |       contains_pii: false
18 |       steward: jane.doe@acme.com
19 |     tags:
20 |       cost_center: 123455
21 |       use_case: Customer 360
22 |     access:
23 |       domain: customer_support
24 |       confidentiality: private
25 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: some_data_product
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: glue
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: process_some_files
15 |         inputs:
16 |           - connection: test_connection
17 |             table: some_schema.some_table
18 |             model: transaction
19 |         outputs:
20 |           - model: transaction
21 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_compilation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: fixture
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: glue
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: process fixtures
15 |         inputs:
16 |           - model: data_product_a.person
17 |           - model: data_product_b.movies
18 |           - model: ratings
19 |         outputs:
20 |           - model: transaction
21 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - connection: test_db_connection
20 |             table: persons
21 |             model: person
22 |         outputs:
23 |           - model: a
24 |           - model: b
25 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct_all_models.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - model: a
20 |         outputs:
21 |           - model: b
22 |           - model: c
23 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct_connection_w_model.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: default_extract
14 |         inputs:
15 |           - connection: connection_name
16 |             table: db_schema.db_table_name
17 |             model: person_pub
18 |         outputs:
19 |           - model: person_pub


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_correct_missing_logic_params.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - model: a
20 |         outputs:
21 |           - model: b
22 |           - model: c
23 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_input_file.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: some_data_product
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: glue
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: process_some_files
15 |         inputs:
16 |           - file: s3://datalakebucke/some_folder/some_file
17 |           - model: person
18 |         outputs:
19 |           - model: person
20 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_missing_logic.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         inputs:
15 |           - connection: test_db_connection
16 |             table: persons
17 |             model: person
18 |         outputs:
19 |           - model: a
20 |           - model: b
21 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_wrong_engine.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   engine: error
11 |   pipeline:
12 |     schedule: "0 3 * * *"
13 |     tasks:
14 |       - id: extract_customers
15 |         logic:
16 |           module: tasks.custom_business_logic
17 |           parameters:
18 |             create_timestamp: true
19 |         inputs:
20 |           - connection: test_db_connection
21 |             table: persons
22 |             model: person
23 |         outputs:
24 |           - model: a
25 |           - model: b
26 | 


--------------------------------------------------------------------------------
/tests/assets/metafiles/product_wrong_output.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | schema_version: 1.rc-1
 5 | product:
 6 |   id: customers
 7 |   owner: jane@acme.com
 8 |   description: some description is required
 9 |   version: "1.0.0"
10 |   pipeline:
11 |     schedule: "0 3 * * *"
12 |     tasks:
13 |       - id: extract_customers
14 |         logic:
15 |           module: tasks.custom_business_logic
16 |           parameters:
17 |             create_timestamp: true
18 |         inputs:
19 |           - connection: test_db_connection
20 |             table: persons
21 |             model: person
22 |         outputs:
23 |           - model: person
24 |           - error: b
25 | 


--------------------------------------------------------------------------------
/tests/aws/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0


--------------------------------------------------------------------------------
/tests/aws/test_datalake.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import json
 5 | from driver.aws.datalake_api import Partition
 6 | from driver.aws.resolvers import reshuffle_partitions
 7 | 
 8 | 
 9 | def test_partitions():
10 |     ps = ['gender=Female/age=20-39', 'gender=Male/age=0-19', 'gender=Female/age=40+', 'gender=Female/age=0-19',
11 |           'gender=Male/age=20-39', 'gender=Male/age=40+']
12 |     partitions = list()
13 |     for p in ps:
14 |         po = Partition(p)
15 |         partitions.append(po)
16 |     for print_p in partitions:
17 |         print(str(print_p))
18 |     part_dict = reshuffle_partitions(prefix='s3a://glue-job-test-destination-bucket/', partitions=partitions)
19 |     print(json.dumps(part_dict, indent=4))
20 | 
21 |     assert len(partitions) > 0


--------------------------------------------------------------------------------
/tests/catalog/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/unified-data-operations/036b148b77997985a51247552add2ec414f32fd1/tests/catalog/__init__.py


--------------------------------------------------------------------------------
/tests/catalog/test_catalog.py:
--------------------------------------------------------------------------------
 1 | from boto3.session import Session
 2 | from pyspark.sql import DataFrame
 3 | from pytest import fixture
 4 | from pyspark.sql.types import (
 5 |     StringType,
 6 |     StructField,
 7 |     StructType,
 8 |     IntegerType,
 9 | )
10 | from driver.task_executor import DataSet
11 | from driver import ConfigContainer
12 | from unittest import skip
13 | 
14 | 
15 | @fixture
16 | def person_frame(spark_session) -> DataFrame:
17 |     return spark_session.createDataFrame(
18 |         [
19 |             (1, "Joe", "Average", 22),
20 |             (2, "Max", "Mustermann", 45),
21 |         ],
22 |         StructType(
23 |             [
24 |                 StructField("id", IntegerType(), True),
25 |                 StructField("first_name", StringType(), True),
26 |                 StructField("last_name", StringType(), True),
27 |                 StructField("age", IntegerType(), True)
28 |             ]
29 |         ),
30 |     )
31 | 
32 | 
33 | @skip("Integration test is skipped for now")
34 | def test_update(person_frame: DataFrame):
35 |     catalog_service = CatalogService(Session(profile_name='finn'))  # NOTE: CatalogService is not imported in this module; the import appears to be missing
36 | 
37 |     catalog_service.drain_database('customers')
38 | 
39 |     catalog_service.update_database('customers', 'person', DataSet(
40 |         id='person',
41 |         df=person_frame,
42 |         product_id='customers',
43 |         model_id='person',
44 |         model=ConfigContainer(
45 |             storage=ConfigContainer(
46 |                 options=ConfigContainer(
47 |                     location='s3://job-interpreter/data/customers'
48 |                 )
49 |             )
50 |         )
51 |     ))
52 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | import datetime
  5 | import os
  6 | from driver import ConfigContainer
  7 | 
  8 | from pyspark.sql import DataFrame
  9 | from pytest import fixture
 10 | from pyspark.sql.types import (
 11 |     StringType,
 12 |     StructField,
 13 |     StructType,
 14 |     IntegerType,
 15 |     LongType,
 16 |     DoubleType, TimestampType
 17 | )
 18 | 
 19 | from driver.util import compile_product, compile_models
 20 | 
 21 | DEFAULT_BUCKET = 's3://test-bucket'
 22 | 
 23 | 
 24 | @fixture(scope='module')
 25 | def fixture_asset_path():
 26 |     cwd_path = os.path.dirname(os.path.abspath(__file__))
 27 |     return os.path.join(cwd_path, 'assets', 'metafiles')
 28 | 
 29 | 
 30 | @fixture(scope='module')
 31 | def app_args() -> ConfigContainer:
 32 |     args = ConfigContainer()
 33 |     setattr(args, 'default_data_lake_bucket', DEFAULT_BUCKET)
 34 |     return args
 35 | 
 36 | @fixture(scope='module')
 37 | def movie_schema() -> StructType:
 38 |     return StructType([
 39 |         StructField('movieId', IntegerType(), True),
 40 |         StructField('title', StringType(), True),
 41 |         StructField('genres', StringType(), True)
 42 |     ])
 43 | 
 44 | 
 45 | @fixture(scope='module')
 46 | def ratings_schema() -> StructType:
 47 |     return StructType([
 48 |         StructField('userId', IntegerType(), True),
 49 |         StructField('movieId', IntegerType(), True),
 50 |         StructField('rating', IntegerType(), True),
 51 |         StructField('timestamp', LongType(), True)
 52 |     ])
 53 | 
 54 | 
 55 | @fixture(scope='module')
 56 | def result_schema() -> StructType:
 57 |     return StructType([
 58 |         StructField('title', StringType(), True),
 59 |         StructField('weight_avg', DoubleType(), True),
 60 |         StructField('num_votes', IntegerType(), True)
 61 |     ])
 62 | 
 63 | 
 64 | @fixture(scope='module')
 65 | def movies_df(spark_session, movie_schema) -> DataFrame:
 66 |     return spark_session.createDataFrame([(1, 'Jumanji(1995)', 'Adventure | Children | Fantasy'),
 67 |                                           (2, 'Heat (1995)', 'Action|Crime|Thriller')],
 68 |                                          movie_schema)
 69 | 
 70 | 
 71 | @fixture(scope='module')
 72 | def ratings_df(spark_session, ratings_schema) -> DataFrame:
 73 |     return spark_session.createDataFrame([(1, 1, 4, 1256677221),
 74 |                                           (2, 1, 4, 1256677222),
 75 |                                           (3, 1, 1, 1256677222),
 76 |                                           (4, 2, 4, 1256677222)
 77 |                                           ], ratings_schema)
 78 | 
 79 | 
 80 | @fixture(scope='module')
 81 | def person_schema() -> StructType:
 82 |     return StructType([
 83 |         StructField('id', IntegerType(), False),
 84 |         StructField('first_name', StringType(), True),
 85 |         StructField('last_name', StringType(), True),
 86 |         StructField('age', IntegerType(), True),
 87 |         StructField('city', StringType(), True),
 88 |         StructField('gender', StringType(), True),
 89 |     ])
 90 | 
 91 | 
 92 | @fixture(scope='module')
 93 | def person_df(spark_session, person_schema) -> DataFrame:
 94 |     return spark_session.createDataFrame([(1, "John", "Doe", 25, "Berlin", "Male"),
 95 |                                           (2, "Jane", "Doe", 41, "Berlin", "Female"),
 96 |                                           (3, "Maxx", "Mustermann", 30, "Berlin", "Male")
 97 |                                           ], person_schema)
 98 | 
 99 | 
100 | @fixture(scope='module')
101 | def transaction_schema() -> StructType:
102 |     return StructType([
103 |         StructField('id', IntegerType(), False),
104 |         StructField('sku', StringType(), True),
105 |         StructField('trx_date', TimestampType(), True),
106 |         StructField('geo', StringType(), True),
107 |         StructField('items', IntegerType(), True)
108 |     ])
109 | 
110 | 
111 | @fixture(scope='module')
112 | def transaction_df(spark_session, transaction_schema) -> DataFrame:
113 |     date_field = datetime.datetime.now()
114 |     return spark_session.createDataFrame([(1, "1234", date_field, "EMEA", 25),
115 |                                           (2, "1235", date_field, "EMEA", 41),
116 |                                           (3, "1236", date_field, "US", 30)
117 |                                           ], transaction_schema)
118 | 
119 | 
120 | @fixture(scope='module')
121 | def product(app_args, fixture_asset_path):
122 |     return compile_product(fixture_asset_path, app_args)
123 | 
124 | 
125 | @fixture(scope='module')
126 | def models(app_args, fixture_asset_path, product):
127 |     return compile_models(fixture_asset_path, product)
128 | 


--------------------------------------------------------------------------------
/tests/test_constraint_checkers.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import datetime
 5 | from driver import ConfigContainer
 6 | 
 7 | import pytest
 8 | from pyspark.sql.functions import lit, col
 9 | 
10 | from driver.core import ValidationException
11 | from driver.processors import past_validator, future_validator, unique_validator, regexp_validator, null_validator, \
12 |     freshness_validator
13 | 
14 | 
15 | def test_past_validator(spark_session, transaction_df):
16 |     past_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5, time_unit='hours'))
17 |     past_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5))
18 |     past_validator(transaction_df, 'trx_date', ConfigContainer())
19 |     past_validator(transaction_df, 'trx_date')
20 |     with pytest.raises(ValidationException) as vex:
21 |         updf = transaction_df.withColumn('trx_date', lit(datetime.datetime.now() + datetime.timedelta(days=5)))
22 |         updf.show()
23 |         past_validator(updf, 'trx_date', ConfigContainer(threshold=5, time_unit='hours'))
24 | 
25 | 
26 | def test_future_validator(spark_session, transaction_df):
27 |     transaction_df.show()
28 |     future_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5, time_unit='hours'))
29 |     future_validator(transaction_df, 'trx_date', ConfigContainer(threshold=5))
30 |     with pytest.raises(ValidationException):
31 |         future_validator(transaction_df, 'trx_date', ConfigContainer())
32 |     with pytest.raises(ValidationException):
33 |         future_validator(transaction_df, 'trx_date')
34 |     updf = transaction_df.withColumn('trx_date', lit(datetime.datetime.now() + datetime.timedelta(days=5)))
35 |     updf.show()
36 |     future_validator(updf, 'trx_date')
37 | 
38 | 
39 | def test_unique_validator(spark_session, transaction_df, transaction_schema):
40 |     unique_validator(transaction_df, 'sku')
41 |     with pytest.raises(ValidationException):
42 |         new_row = spark_session.createDataFrame([(4, "1236", datetime.datetime.now(), "US", 30)], transaction_schema)
43 |         appended = transaction_df.union(new_row)
44 |         appended.show()
45 |         unique_validator(appended, 'sku')
46 | 
47 | 
48 | def test_regexp_validator(spark_session, transaction_df, transaction_schema):
49 |     regexp_validator(transaction_df, 'geo', ConfigContainer(value='^EMEA|US$'))
50 |     with pytest.raises(ValidationException):
51 |         new_row = spark_session.createDataFrame([(4, "1237", datetime.datetime.now(), "APJ", 30)], transaction_schema)
52 |         appended = transaction_df.union(new_row)
53 |         regexp_validator(appended, 'geo', ConfigContainer(value='^EMEA|US$'))
54 | 
55 | 
56 | def test_null_validator(spark_session, transaction_df, transaction_schema):
57 |     null_validator(transaction_df, 'geo')
58 |     with pytest.raises(ValidationException):
59 |         new_row = spark_session.createDataFrame([(4, "1237", datetime.datetime.now(), None, 30)], transaction_schema)
60 |         appended = transaction_df.union(new_row)
61 |         null_validator(appended, 'geo')
62 | 
63 | 
64 | def test_freshness_validator(spark_session, transaction_df, transaction_schema):
65 |     freshness_validator(transaction_df, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes'))
66 |     with pytest.raises(ValidationException):
67 |         trx_date = datetime.datetime.now() - datetime.timedelta(minutes=10)
68 |         upd_df = transaction_df.withColumn("trx_date", lit(trx_date))
69 |         freshness_validator(upd_df, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes'))
70 | 
71 |     freshness_validator(transaction_df, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes', group_by='geo'))
72 | 
73 |     with pytest.raises(ValidationException):
74 |         trx_date = datetime.datetime.now() - datetime.timedelta(minutes=10)
75 |         new_row = spark_session.createDataFrame([(4, "1237", trx_date, "APJ", 30)], transaction_schema)
76 |         appended = transaction_df.union(new_row)
77 |         freshness_validator(appended, 'trx_date', ConfigContainer(threshold=1, time_unit='minutes', group_by='geo'))
78 | 


--------------------------------------------------------------------------------
/tests/test_core.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | def test_resolve_data_set_id():
5 |     pass
6 | 


--------------------------------------------------------------------------------
/tests/test_df_schema_validator.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import pytest
 5 | from time import time
 6 | from datetime import datetime
 7 | from pyspark.sql import DataFrame
 8 | from pyspark.sql.functions import col, unix_timestamp, lit
 9 | from driver.core import DataSet, DataProduct, SchemaValidationException
10 | from driver.processors import schema_checker, razor
11 | from driver.util import compile_models, filter_list_by_id
12 | 
13 | 
14 | def test_df_schema_validator(movies_df: DataFrame, product, models):
15 |     movie_model = filter_list_by_id(models, 'movie')
16 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
17 |     ds = DataSet(id='movie', df=movies_df, model=movie_model, product=dp)
18 |     ds = schema_checker(ds)
19 | 
20 | 
21 | def test_df_schema_validator_missing_fields(movies_df: DataFrame, product, models):
22 |     movie_model = filter_list_by_id(models, 'movie')
23 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
24 |     ds = DataSet(id='movie', df=movies_df.drop(col('genres')), model=movie_model, product=dp)
25 |     with pytest.raises(SchemaValidationException):
26 |         ds = schema_checker(ds)
27 | 
28 | 
29 | def test_df_schema_validator_extra_fields_lazy(movies_df: DataFrame, product, models):
30 |     movie_model = filter_list_by_id(models, 'movie')
31 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
32 |     timestamp = datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
33 |     df = movies_df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
34 |     ds = DataSet(id='movie', df=df, model=movie_model, product=dp)
35 |     ds = schema_checker(ds)
36 |     df.show()
37 | 
38 | 
39 | def test_df_schema_validator_extra_fields_strict(movies_df: DataFrame, product, fixture_asset_path):
40 |     models = compile_models(fixture_asset_path, product, def_file_name='model_strict_validation.yml')
41 |     movie_model = filter_list_by_id(models, 'movie')
42 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
43 |     timestamp = datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
44 |     df = movies_df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
45 |     ds = DataSet(id='movie', df=df, model=movie_model, product=dp)
46 |     df.show()
47 |     with pytest.raises(SchemaValidationException) as exc:
48 |         ds = schema_checker(ds)
49 | 
50 | 
51 | def test_df_schema_validator_extra_fields_strict_with_razor(movies_df: DataFrame, product, fixture_asset_path):
52 |     models = compile_models(fixture_asset_path, product, def_file_name='model_remove_xtra_columns.yml')
53 |     movie_model = filter_list_by_id(models, 'movie')
54 |     dp = DataProduct(id=product.id, description=product.description, owner=product.owner)
55 |     timestamp = datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
56 |     df = movies_df.withColumn('time', unix_timestamp(lit(timestamp), 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))
57 |     ds = DataSet(id='movie', df=df, model=movie_model, product=dp)
58 |     razor(ds)
59 |     ds.df.show()
60 |     ds = schema_checker(ds)
61 | 


--------------------------------------------------------------------------------
/tests/test_model_compilation.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | import os
  5 | 
  6 | import pytest
  7 | from jsonschema import ValidationError
  8 | 
  9 | from driver import util
 10 | from driver.core import ArtefactType
 11 | from driver.util import compile_product, compile_models, filter_list_by_id
 12 | from tests.conftest import DEFAULT_BUCKET
 13 | 
 14 | 
 15 | # @pytest.fixture
 16 | # def metadata_path():
 17 | #     cwd_path = os.path.dirname(os.path.abspath(__file__))
 18 | #     return os.path.join(cwd_path, 'assets', 'model_defs')
 19 | 
 20 | 
 21 | def test_basic_model_compilation(fixture_asset_path, app_args):
 22 |     product = compile_product(fixture_asset_path, app_args)
 23 |     models = compile_models(fixture_asset_path, product)
 24 |     assert product.engine == 'glue'
 25 |     assert product.id == 'some_data_product'
 26 |     assert product.owner == 'jane@acme.com'
 27 |     assert product.version == '1.0.0'
 28 |     assert product.defaults.storage.location == DEFAULT_BUCKET
 29 |     assert getattr(product, 'pipeline')
 30 |     assert getattr(product.pipeline, 'tasks')
 31 |     assert len(product.pipeline.tasks) == 1
 32 |     assert product.pipeline.tasks[0].id == 'process_some_files'
 33 |     assert len(product.pipeline.tasks[0].inputs) == 1
 34 |     assert len(product.pipeline.tasks[0].outputs) == 1
 35 |     inp = product.pipeline.tasks[0].inputs[0]
 36 |     assert hasattr(inp, 'connection')
 37 |     assert hasattr(inp, 'model')
 38 |     assert hasattr(inp, 'table')
 39 |     assert len(models) == 4
 40 | 
 41 | 
 42 | # def test_connection_with_model(metadata_path):
 43 | #     args = ConfigContainer()
 44 | #     product = compile_product(metadata_path, args, prod_def_filename='product_correct_connection_w_model.yml')
 45 | #     models = compile_models(metadata_path, product, def_file_name='model_correct.yml')
 46 | #     assert len(models) == 2
 47 | 
 48 | 
 49 | # def test_minimal_model_compilation(product, models):
 50 | #     assert models
 51 | 
 52 | 
 53 | def test_advanced_compilation_features(fixture_asset_path, app_args):
 54 |     # abs_product_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'assets', 'advanced_compilation')
 55 |     product = compile_product(fixture_asset_path, app_args, prod_def_filename='product_compilation.yml')
 56 |     models = compile_models(fixture_asset_path, product, def_file_name='model_compilation.yml')
 57 |     assert len(models) == 2
 58 |     person_pii = filter_list_by_id(models, 'person_pii')
 59 |     assert person_pii.storage.location == DEFAULT_BUCKET, 'The default bucket should be set on models with no explicit location'
 60 |     assert person_pii.storage.type == 'lake'
 61 |     person_pub = filter_list_by_id(models, 'person_pub')
 62 |     assert person_pub.storage.type == 'lake'
 63 |     pub_full_name_col = filter_list_by_id(person_pub.columns, 'full_name')
 64 |     assert pub_full_name_col.type == 'string', 'The String type should have been inherited from the pii model'
 65 |     assert pub_full_name_col.transform[
 66 |                0].type == 'encrypt', 'The Transform should have been inherited from the pii model'
 67 |     pub_full_id_col = filter_list_by_id(person_pub.columns, 'id')
 68 |     assert pub_full_id_col, 'ID col should have been inherited from the pii model'
 69 |     assert pub_full_id_col.type == 'integer', 'ID col type should have been inherited from the pii model'
 70 |     gender = filter_list_by_id(person_pub.columns, 'gender')
 71 |     assert gender, 'The model should inherit the gender column from the person pii model'
 72 | 
 73 | 
 74 | # def test_mode_extend_compilation_non_specified_field():
 75 | #     abs_product_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'assets')
 76 | #     args = ConfigContainer()
 77 | #     product = compile_product(abs_product_path, args)
 78 | #     models = compile_models(abs_product_path, product)
 79 | 
 80 | 
 81 | def test_model_schema_correct(fixture_asset_path):
 82 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'model_correct.yml'))
 83 |     util.validate_schema(product_def, ArtefactType.model)
 84 | 
 85 | 
 86 | def test_product_schema_correct(fixture_asset_path):
 87 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_correct.yml'))
 88 |     util.validate_schema(product_def, ArtefactType.product)
 89 | 
 90 | 
 91 | def test_product_schema_correct_with_models(fixture_asset_path):
 92 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_correct_all_models.yml'))
 93 |     util.validate_schema(product_def, ArtefactType.product)
 94 | 
 95 | 
 96 | def test_product_schema_wrong_engine(fixture_asset_path):
 97 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_wrong_engine.yml'))
 98 |     with pytest.raises(ValidationError) as vex:
 99 |         util.validate_schema(product_def, ArtefactType.product)
100 | 
101 | 
102 | def test_product_schema_output_err(fixture_asset_path):
103 |     # missing module parameters
104 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_wrong_output.yml'))
105 |     with pytest.raises(ValidationError) as vex:
106 |         util.validate_schema(product_def, ArtefactType.product)
107 | 
108 | 
109 | def test_product_missing_logic(fixture_asset_path):
110 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_missing_logic.yml'))
111 |     util.validate_schema(product_def, ArtefactType.product)
112 | 
113 |     product_def = util.load_yaml(os.path.join(fixture_asset_path, 'product_correct_missing_logic_params.yml'))
114 |     util.validate_schema(product_def, ArtefactType.product)
115 | 
116 | 
117 | def test_connection_input_configuration(fixture_asset_path):
118 |     pass
119 | 
120 | 
121 | def test_model_input_configuration(fixture_asset_path):
122 |     pass
123 | 
124 | 
125 | def test_file_input_configuration(fixture_asset_path):
126 |     # product_input_file.yml
127 |     pass
128 | 


--------------------------------------------------------------------------------
/tests/test_task_executor.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | import driver
 5 | from driver import ConfigContainer
 6 | from pyspark.sql import DataFrame
 7 | from driver import DataSet
 8 | from driver.processors import schema_checker, constraint_processor, transformer_processor
 9 | 
10 | 
11 | def test_end_to_end(spark_session, transaction_df: DataFrame, fixture_asset_path, app_args):
12 |     dfs = {"some_schema.some_table": transaction_df}
13 | 
14 |     def mock_input_handler(props: ConfigContainer):
15 |         return dfs.get(props.table)
16 | 
17 |     def mock_output_handler(ds: DataSet):
18 |         assert ds.id == 'transaction'
19 |         assert ds.df.count() == transaction_df.count()
20 |         ds.df.show()
21 |         ds.df.describe()
22 | 
23 |     driver.init(spark_session)
24 |     driver.register_data_source_handler('connection', mock_input_handler)
25 |     driver.register_postprocessors(transformer_processor, schema_checker, constraint_processor)
26 |     driver.register_output_handler('default', mock_output_handler)
27 |     driver.register_output_handler('lake', mock_output_handler)
28 |     setattr(app_args, 'product_path', fixture_asset_path)
29 |     print('something')
30 |     driver.process_product(app_args, fixture_asset_path)
31 | 
32 | def test_resolve_io_type():
33 |     pass
34 | 


--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | def test_enrich_models(fixture_asset_path):
5 |     pass
6 | 


--------------------------------------------------------------------------------
/version.sh:
--------------------------------------------------------------------------------
1 | export VERSION=1.0.4


--------------------------------------------------------------------------------