├── .cfn-nag-deny-list.yml ├── .cfnlintrc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ ├── question.md │ └── support-the-sdlf.md └── workflows │ ├── static-checking.yml │ └── workshop-deployment.yml ├── .gitignore ├── .mkdocs.yml ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── deploy.sh ├── docs ├── _static │ ├── drawio │ │ ├── code-repositories-structure.drawio │ │ ├── datalake-architecture.drawio │ │ ├── sdlf-architecture-datalake.drawio │ │ ├── sdlf-architecture-datamesh.drawio │ │ ├── sdlf-dataset.drawio │ │ ├── sdlf-foundations.drawio │ │ ├── sdlf-in-a-nutshell.drawio │ │ ├── sdlf-monitoring.drawio │ │ ├── sdlf-pipeline.drawio │ │ ├── sdlf-stage-dataquality.drawio │ │ ├── sdlf-stage-glue.drawio │ │ ├── sdlf-stage-lambda.drawio │ │ └── sdlf-team.drawio │ ├── public-references.png │ ├── sail-icon.ico │ ├── sail-icon.png │ ├── sdlf-architecture-datalake.png │ ├── sdlf-architecture-datamesh.png │ ├── sdlf-cicd-gluejobsdeployer.png │ ├── sdlf-cicd.png │ ├── sdlf-dataset.png │ ├── sdlf-foundations.png │ ├── sdlf-in-a-nutshell.png │ ├── sdlf-layers-architecture.png │ ├── sdlf-logo.svg │ ├── sdlf-monitoring.png │ ├── sdlf-pipeline-full.png │ ├── sdlf-pipeline.png │ ├── sdlf-stage-dataquality.png │ ├── sdlf-stage-glue.png │ ├── sdlf-stage-lambda.png │ └── sdlf-team.png ├── architecture.md ├── constructs │ ├── cicd.md │ ├── dataset.md │ ├── foundations.md │ ├── index.md │ ├── monitoring.md │ ├── pipeline.md │ ├── stage-dataquality.md │ ├── stage-glue.md │ ├── stage-lambda.md │ └── team.md ├── index.md └── requirements.txt ├── pyproject.toml ├── sdlf-cicd ├── .gitignore ├── README.md ├── deploy-cicd.sh ├── deploy-role.sh ├── lambda │ ├── crossaccountteam-cicd │ │ └── src │ │ │ └── lambda_function.py │ ├── domain-cicd │ │ └── src │ │ │ └── lambda_function.py │ ├── parser-cicd │ │ └── src │ │ │ └── lambda_function.py │ └── stagesrepositories-cicd │ │ └── src │ │ └── lambda_function.py ├── nested-stacks │ ├── template-cicd-cfn-module.yaml │ ├── template-cicd-glue-job.yaml │ ├── template-cicd-lambda-layer.yaml │ └── template-cicd-modules-pipelines.yaml ├── sam-translate.py ├── tags.json ├── template-cfn-module.yaml ├── template-cicd-domain-roles.yaml ├── template-cicd-domain-team-role.yaml ├── template-cicd-domain.yaml ├── template-cicd-generic-git.yaml ├── template-cicd-generic-role.yaml ├── template-cicd-prerequisites.yaml ├── template-cicd-sdlf-pipelines.yaml ├── template-cicd-sdlf-repositories.codecommit.yaml ├── template-cicd-sdlf-repositories.github.yaml ├── template-cicd-sdlf-repositories.gitlab.yaml ├── template-cicd-team-pipeline.yaml ├── template-cicd-team-repository.yaml ├── template-codecommit-pr-check.yaml ├── template-generic-cfn-module.yaml ├── template-generic-cfn-template.yaml ├── template-glue-job.part ├── template-glue-job.yaml └── template-lambda-layer.yaml ├── sdlf-datalakeLibrary ├── .gitignore ├── README.md ├── buildspec.sh ├── python │ ├── __init__.py │ └── datalake_library │ │ ├── commons.py │ │ ├── data_quality │ │ └── schema_validator.py │ │ ├── datalake_exceptions.py │ │ ├── interfaces │ │ ├── __init__.py │ │ ├── dynamo_interface.py │ │ ├── s3_interface.py │ │ ├── sqs_interface.py │ │ └── states_interface.py │ │ ├── requirements.txt │ │ └── sdlf │ │ ├── __init__.py │ │ ├── __version__.py │ │ ├── config.py │ │ ├── peh.py │ │ └── utils.py └── template-lambda-layer.yaml ├── sdlf-dataset ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── dataset.py │ 
├── dataset.yaml │ └── template.yaml ├── sdlf-foundations ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── foundations.py │ ├── foundations.yaml │ ├── lambda │ ├── catalog-redrive │ │ └── src │ │ │ └── lambda_function.py │ ├── catalog │ │ └── src │ │ │ └── lambda_function.py │ └── replicate │ │ └── src │ │ ├── event-create-delete-table.json │ │ ├── event-update-table.json │ │ └── lambda_function.py │ └── template.yaml ├── sdlf-monitoring ├── .gitignore ├── kibana │ ├── generic_dashboard.json │ └── generic_visualizations.json ├── lambda │ ├── cloudwatchlogs-transformer │ │ └── src │ │ │ └── lambda_function.py │ └── topic │ │ └── src │ │ └── lambda_function.py └── template.yaml ├── sdlf-pipeline ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── pipeline.py │ ├── pipeline.yaml │ └── template.yaml ├── sdlf-stage-dataquality ├── .gitignore ├── lambda │ ├── initial-check │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-redrive │ │ └── src │ │ │ └── lambda_function.py │ └── stage-routing │ │ └── src │ │ └── lambda_function.py ├── state-machine │ └── data-quality.asl.json └── template.yaml ├── sdlf-stage-ecsfargate ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── ecsfargate.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── state-machine │ └── stage-ecsfargate.asl.json │ └── template.yaml ├── sdlf-stage-emrserverless ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── emrserverless.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── stageemrserverless.yaml │ └── state-machine │ └── stage-emrserverless.asl.json ├── sdlf-stage-glue ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── glue.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── stageglue.yaml │ ├── state-machine │ └── stage-glue.asl.json │ └── template.yaml ├── sdlf-stage-lambda ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── awslambda.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── process-object │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── stagelambda.yaml │ ├── state-machine │ └── stage-lambda.asl.json │ └── template.yaml ├── sdlf-stageA ├── .gitignore ├── lambda │ ├── stage-a-error │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-preupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-process-object │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-redrive │ │ └── src │ │ │ └── lambda_function.py │ └── stage-a-routing │ │ └── src │ │ └── lambda_function.py ├── 
state-machine │ └── stage-a.asl.json └── template.yaml ├── sdlf-stageB ├── .gitignore ├── lambda │ ├── stage-b-error │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-fetch-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-redrive │ │ └── src │ │ │ └── lambda_function.py │ └── stage-b-routing │ │ └── src │ │ └── lambda_function.py ├── state-machine │ └── stage-b.asl.json └── template.yaml ├── sdlf-team ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── lambda │ ├── datasets-dynamodb │ │ └── src │ │ │ └── lambda_function.py │ └── pipelines-dynamodb │ │ └── src │ │ └── lambda_function.py │ ├── team.py │ ├── team.yaml │ └── template.yaml ├── sdlf-utils └── workshop-examples │ ├── 10-demo │ └── sdlf-workshop │ │ ├── dataset-legislators.yaml │ │ ├── foundations-datalake-dev.yaml │ │ ├── pipeline-main.yaml │ │ ├── tags.json │ │ └── team-datalake-engineering-dev.yaml │ ├── 10-deployment │ ├── sdlf-main-datalake-engineering │ │ ├── datasets.yaml │ │ ├── pipeline-main.yaml │ │ ├── pipelines.yaml │ │ └── tags.json │ └── sdlf-main │ │ ├── datadomain-datalake-dev.yaml │ │ ├── foundations-datalake-dev.yaml │ │ ├── tags.json │ │ └── team-datalake-engineering-dev.yaml │ ├── 20-production │ ├── sdlf-main-proserve-iot │ │ ├── datasets.yaml │ │ ├── pipeline-main.yaml │ │ ├── pipeline-singlestage.yaml │ │ ├── pipelines.yaml │ │ └── tags.json │ └── sdlf-main │ │ ├── datadomain-marketing-dev.yaml │ │ ├── datadomain-proserve-dev.yaml │ │ ├── foundations-marketing-dev.yaml │ │ ├── foundations-proserve-dev.yaml │ │ ├── tags.json │ │ ├── team-marketing-industry-dev.yaml │ │ └── team-proserve-iot-dev.yaml │ ├── clean-up.sh │ └── legislators │ ├── data │ ├── memberships.json │ ├── organizations.json │ ├── persons.json │ └── regions.json │ ├── deploy.sh │ └── scripts │ ├── legislators-glue-job.py │ └── legislators-glue-job.yaml └── validate.sh /.cfn-nag-deny-list.yml: -------------------------------------------------------------------------------- 1 | RulesToSuppress: 2 | - id: W76 3 | reason: too experimental. https://stelligent.com/2020/03/27/thought-experiment-proposed-complexity-metric-for-iam-policy-documents/ 4 | - id: W89 5 | reason: SDLF does not support running in VPC by default 6 | - id: W92 7 | reason: ReservedConcurrentExecutions 8 | -------------------------------------------------------------------------------- /.cfnlintrc: -------------------------------------------------------------------------------- 1 | ignore_templates: 2 | - "sdlf-utils/workshop-examples/10-demo/sdlf-workshop/*.yaml" 3 | include_checks: 4 | - I 5 | ignore_checks: 6 | - W3002 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **SDLF release (if known):** 27 | E.g. 
1.5.2 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask us a question 4 | title: '' 5 | labels: question 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | Please be as specific as possible 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/support-the-sdlf.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Support the SDLF 3 | about: Add your organisation's name or logo to the SDLF GitHub README 4 | title: "[Support the SDLF]: " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Thank you for letting us use your organisation's name on the SDLF README page and letting other customers know that you support the project! If you would like us to also display your organisation's logo, please raise a linked pull request to provide an image file for the logo.
11 | 12 | Please add any files to *docs/_static/* 13 | 14 | Organisation Name: 15 | Your Name: 16 | Your Position: 17 | I have included a logo: y/n 18 | 19 | *By raising a Support the SDLF issue (and related pull request), you are granting AWS permission to use your company’s name (and logo) for the limited purpose described here and you are confirming that you have authority to grant such permission.* 20 | -------------------------------------------------------------------------------- /.github/workflows/static-checking.yml: -------------------------------------------------------------------------------- 1 | name: Static Checking 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | cfn: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 3.12 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: 3.12 20 | - name: Set up Ruby 3.2 21 | uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: 3.2 24 | - name: install requirements 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install "cfn-lint<1" 28 | gem install cfn-nag 29 | - name: cfn-lint 30 | run: | 31 | shopt -s globstar 32 | cfn-lint ./**/*.yaml 33 | - name: cfn-nag 34 | run: | 35 | cat <<EOT >> .cfn-nag-deny-list.yml 36 | - id: W61 37 | reason: |- 38 | Certificates are handled by customers downstream, see https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-encryption-enable.html#emr-encryption-certificates 39 | This is ignored only during CI as we want customers to be aware they need to update the security configuration should they choose to use it. 40 | EOT 41 | find . -not \( -type f -name 'template-glue-job.yaml' -o -type f -name 'template-lambda-layer.yaml' \) -type f -name '*.yaml' -print0 \ 42 | | xargs -0 -L 1 cfn_nag_scan --fail-on-warnings --ignore-fatal --deny-list-path .cfn-nag-deny-list.yml --input-path 43 | python: 44 | runs-on: ubuntu-latest 45 | steps: 46 | - uses: actions/checkout@v4 47 | - name: Set up Python 3.12 48 | uses: actions/setup-python@v5 49 | with: 50 | python-version: 3.12 51 | - name: install requirements 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install ruff 55 | - name: ruff format 56 | run: ruff format --check . 57 | - name: ruff 58 | run: ruff check --output-format github . 59 | shellcheck: 60 | runs-on: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v4 63 | - name: install requirements 64 | run: | 65 | sudo apt update 66 | sudo apt install shellcheck 67 | - name: shellcheck 68 | run: | 69 | find .
-type f \( -name '*.sh' -o -name '*.bash' -o -name '*.ksh' \) -print0 \ 70 | | xargs -0 shellcheck -x --format gcc 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Editors 2 | .vscode/ 3 | *.code-workspace 4 | .idea/ 5 | .devcontainer/ 6 | 7 | # Mac/OSX 8 | .DS_Store 9 | 10 | # Windows 11 | Thumbs.db 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Misc 22 | automated-deployment/ 23 | rpdk.log 24 | 25 | # Distribution / packaging 26 | output/ 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # cdk 158 | cdk.out 159 | cdk.context.json 160 | -------------------------------------------------------------------------------- /.mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AWS Serverless Data Lake Framework 2 | site_url: https://sdlf.readthedocs.io 3 | repo_url: https://github.com/awslabs/aws-serverless-data-lake-framework 4 | copyright: Amazon Web Services, Inc. All Rights Reserved. 5 | theme: 6 | name: material 7 | features: 8 | - navigation.tabs 9 | - navigation.tabs.sticky 10 | - toc.integrate 11 | - navigation.indexes 12 | - navigation.path 13 | logo: _static/sdlf-logo.svg 14 | favicon: _static/sail-icon.ico 15 | markdown_extensions: 16 | - admonition 17 | - attr_list 18 | - tables 19 | plugins: 20 | - search 21 | nav: 22 | - index.md 23 | - architecture.md 24 | - Constructs: 25 | - constructs/index.md 26 | - constructs/foundations.md 27 | - constructs/team.md 28 | - constructs/dataset.md 29 | - constructs/pipeline.md 30 | - constructs/stage-lambda.md 31 | - constructs/stage-glue.md 32 | - constructs/stage-dataquality.md 33 | - constructs/monitoring.md 34 | - constructs/cicd.md 35 | - 'Workshop': 'https://sdlf.workshop.aws/' 36 | - 'License': 'https://github.com/awslabs/aws-serverless-data-lake-framework/blob/main/LICENSE' 37 | - 'Contributing': 'https://github.com/awslabs/aws-serverless-data-lake-framework/blob/main/CONTRIBUTING.md' 38 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.12" 7 | 8 | mkdocs: 9 | configuration: .mkdocs.yml 10 | fail_on_warning: true 11 | 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-dataset.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-monitoring.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-pipeline.drawio: -------------------------------------------------------------------------------- 1 | 7VrbctowEP0aZtqHMMgyt0duucyQPpTOJHliNLawXWzLlUWAfn1XWAZsCZJ2uCZ9CdZKtrTnHO1q7VRwL1rccZL4j8ylYcWquYsK7lcsq2E34K80LDNDE6HM4PHAzUxbhlHwmypjTVlngUvTwkDBWCiCpGh0WBxTRxRshHM2Lw6bsLA4a0I8qhlGDgl161PgCl9ZUaO96bingeerqVtWM+uISD5YeZL6xGXzLRMeVHCPMyayq2jRo6HELsclu+92R+96YZzG4j03kF/93qBPX57Hj+i50ZxMn6f3N1b2lFcSzpTDarFimSPA2Sx2qXxIrYK7cz8QdJQQR/bOgXKw+SIKoYXgchKEYY+FjEM7ZjEM6qoZKBd0sXPpaA0ICImyiAq+hCHqhmY9u0NpyMIK0vmGEbumbP4WGXZTGYlSgbd+9AYouFBY/QVuWMNt1B/eSqeChIYBOF6GEbwXRaxSwdmUltAyAEjCwIuh6QBaFOxdiWUAEu2ojihwXTmNkZwifUcgA+lktAxcWMeiwtao6DyNwDASNIGf21nsiIDFqVwi49NJCJuwTA6bCUlabx1EJFYeJ24AoJTo2GLQhPeExUKFMWTlbTWTfCqEgUReRwtPBswqmad21QOWktWUDxDIjL1juByn4NN4kns0XvtTFlPFwnW73m3YZkWVBSRYsqWzkE6kUlPwK4i94arVxzXli2kKl6T+ISWGyhqrtzWNWaYNj9roSCpr6RteQOjvgO0LrXpV+GXc8SnwQASVYnOJIDIPsFcaSb8h+XAWwc/3ztMqi0mN/ujcDb5+3FiBaqhajt06ldiunzBe5PFKj92pZPRCyeBMELnpoXnTPhQ7Fi5wgw3bzBTK0bGosfS0qtFBXTifqSbjwmcei0k42FhLwG3GDJkMcysGf1IhlipKkxlsxQK/2Zxyov3AwrrYjDt0n9hUdhKEe1Tsc90yU8VpCLy/Fldy+D2hJ1GVPpFGwAbeXSnw6OfBdXDIhWs4gyBkOoS0j6Xc5pULNd9pbwu1cU6h5svUhbqvcLkcoRoi7GmFihq7s5/E8SKT30GYaBXPIe0zly2WgYhMyfgalGyfPeTiugbg/8LvgxV+tmGXnrjww6Z9CnVC9x8qv1W5l9d+nW+d4cuPh97oA9d/TVwq/9Zv5Aqv7tAJ4y5u7k6An7v8q78zJR6t/MP6SxaNjus6VecvO948Vds7qDrNqTpf5rWUf6hdFK6NznwWsWsaTtcl1BzBt4WKzynUfJnXUv6VhWqIsCcWqqF+/hTlH7aKTFh6yDjtVyvDF8RLLv9KSm4cMeRCc/NVfNW39a8FePAH -------------------------------------------------------------------------------- /docs/_static/public-references.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/public-references.png 
-------------------------------------------------------------------------------- /docs/_static/sail-icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sail-icon.ico -------------------------------------------------------------------------------- /docs/_static/sail-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sail-icon.png -------------------------------------------------------------------------------- /docs/_static/sdlf-architecture-datalake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-architecture-datalake.png -------------------------------------------------------------------------------- /docs/_static/sdlf-architecture-datamesh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-architecture-datamesh.png -------------------------------------------------------------------------------- /docs/_static/sdlf-cicd-gluejobsdeployer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-cicd-gluejobsdeployer.png -------------------------------------------------------------------------------- /docs/_static/sdlf-cicd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-cicd.png -------------------------------------------------------------------------------- /docs/_static/sdlf-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-dataset.png -------------------------------------------------------------------------------- /docs/_static/sdlf-foundations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-foundations.png -------------------------------------------------------------------------------- /docs/_static/sdlf-in-a-nutshell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-in-a-nutshell.png -------------------------------------------------------------------------------- /docs/_static/sdlf-layers-architecture.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-layers-architecture.png -------------------------------------------------------------------------------- /docs/_static/sdlf-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Produced by OmniGraffle 7.17.5\n2020-11-27 16:09:02 +0000 6 | 7 | Canvas 1 8 | 9 | Layer 2 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/_static/sdlf-monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-monitoring.png -------------------------------------------------------------------------------- /docs/_static/sdlf-pipeline-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-pipeline-full.png -------------------------------------------------------------------------------- /docs/_static/sdlf-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-pipeline.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-dataquality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-stage-dataquality.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-glue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-stage-glue.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-stage-lambda.png -------------------------------------------------------------------------------- /docs/_static/sdlf-team.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-team.png -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | SDLF supports both a centralized datalake deployment pattern and decentralized data domains which could be used as a basis for a [data mesh](https://aws.amazon.com/what-is/data-mesh/) deployment pattern. 
4 | 5 | ## Centralized Data Lake 6 | 7 | ![Centralized Data Lake Architecture](_static/sdlf-architecture-datalake.png) 8 | 9 | !!! warning 10 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 11 | 12 | ## Data Mesh 13 | 14 | The Data Mesh pattern is fundamentally about decentralized data ownership, with data owned by specialized domain teams rather than a centralized data team. This usually means: 15 | 16 | - each data domain team has its own dedicated data infrastructure, for production and/or consumption 17 | - each data domain team is able to deploy the tools and infrastructure it needs - a self-serve data platform 18 | 19 | A governance layer is federating data assets in a business catalog to ensure compliance against policies and standards, and ease of data sharing across teams. 20 | 21 | As such, it can be seen as a collection of data domain-specific datalakes deployed with SDLF. [Amazon SageMaker Data and AI Governance](https://aws.amazon.com/sagemaker/data-ai-governance/) (built on Amazon DataZone) can be used for the governance layer. 22 | 23 | ![Data Mesh Architecture](_static/sdlf-architecture-datamesh.png) 24 | 25 | !!! warning 26 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 27 | 28 | ## Transactional Data Lake 29 | 30 | Using [Iceberg](https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/introduction.html). 31 | 32 | !!! warning 33 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 34 | -------------------------------------------------------------------------------- /docs/constructs/dataset.md: -------------------------------------------------------------------------------- 1 | # sdlf-dataset 2 | 3 | !!! note 4 | `sdlf-dataset` is defined in the [sdlf-dataset](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-dataset) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Dataset](../_static/sdlf-dataset.png) 9 | 10 | A SDLF dataset is a logical construct referring to a grouping of data. It can be anything from a single table to an entire database with multiple tables for example. However, an overall good practice is to limit the infrastructure deployed to the minimum to avoid unnecessary overhead and cost. It means that in general, the more data is grouped together the better. Abstraction at the transformation code level can then help make distinctions within a given dataset. 11 | 12 | Examples of datasets are: 13 | 14 | - A relational database with multiple tables (e.g. Sales DB with orders and customers tables) 15 | - A group of files from a data source (e.g. XML files from a Telemetry system) 16 | - A streaming data source (e.g. Kinesis data stream batching files and dumping them into S3) 17 | 18 | `sdlf-dataset` creates a Glue database, as well as a Glue crawler. 19 | 20 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 21 | 22 | ## Usage 23 | 24 | ### CloudFormation with [sdlf-cicd](cicd.md) 25 | 26 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 
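Once deployed, downstream code can look up the resources the module created through the SSM parameters listed under Interface below. A minimal boto3 sketch, assuming a team named `iot` and a dataset named `legislators` as in the module declaration that follows:

```python
import boto3

ssm = boto3.client("ssm")

# Glue crawler created by sdlf-dataset; the parameter path comes from the Interface table below
crawler_name = ssm.get_parameter(Name="/SDLF/Glue/iot/legislators/GlueCrawler")["Parameter"]["Value"]
print(crawler_name)
```

The module declaration itself looks like this: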
27 | 28 | ``` 29 | rExample: 30 | Type: awslabs::sdlf::dataset::MODULE 31 | Properties: 32 | pPipelineReference: !Ref pPipelineReference 33 | pTeamName: iot 34 | pDatasetName: legislators 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-dataset` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ----------------------------------------- | -------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Datasets/{team}/{dataset}` | Dataset-specific metadata for data pipelines | | 44 | | `/SDLF/Glue/{team}/{dataset}/GlueCrawler` | Team dataset Glue crawler | | 45 | | `/SDLF/Glue/{team}/{dataset}/DataCatalog` | Team dataset metadata catalog | | 46 | -------------------------------------------------------------------------------- /docs/constructs/monitoring.md: -------------------------------------------------------------------------------- 1 | # sdlf-monitoring 2 | 3 | !!! note 4 | `sdlf-monitoring` is defined in the [sdlf-monitoring](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-monitoring) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Monitoring](../_static/sdlf-monitoring.png) 9 | 10 | CloudTrail (Auditing) and S3 Storage Lens are resources implemented in the framework. They are deployed once only and are consumed by all systems and users across the lake. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rProserveMonitoring: 20 | Type: awslabs::sdlf::monitoring::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pCloudtrailEnabled: true 24 | ``` 25 | 26 | ## Interface 27 | 28 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-monitoring` publishes the following parameters: 29 | 30 | | SSM Parameter | Description | Comment | 31 | | ------------------------------------------ | ---------------------------------------------------------------- | -------------------------------------------- | 32 | | `/SDLF/S3/CloudTrailBucket` | Name of CloudTrail S3 bucket | | 33 | 34 | -------------------------------------------------------------------------------- /docs/constructs/stage-dataquality.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-dataquality 2 | 3 | !!! note 4 | `sdlf-stage-dataquality` is defined in the [sdlf-stage-dataquality](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stage-dataquality) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Data Quality](../_static/sdlf-stage-dataquality.png) 9 | 10 | Create a Glue Data Quality ruleset from recommendations, then apply this ruleset to a given Glue table. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example.
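The rulesets this stage creates and evaluates live in Glue Data Quality, so they can be inspected with the regular Glue API. A short boto3 sketch (purely illustrative, no SDLF-specific naming assumed):

```python
import boto3

glue = boto3.client("glue")

# List the Glue Data Quality rulesets in the account, with the table each one targets
response = glue.list_data_quality_rulesets()
for ruleset in response["Rulesets"]:
    print(ruleset["Name"], ruleset.get("TargetTable"))
```

An example module declaration, chained to the previous stage's state machine through `pEventPattern`: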
17 | 18 | ``` 19 | rMainDq: 20 | Type: proserve::iot::dataquality::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: DQ 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: !Sub >- 28 | { 29 | "source": ["aws.states"], 30 | "detail-type": ["Step Functions Execution Status Change"], 31 | "detail": { 32 | "status": ["SUCCEEDED"], 33 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-b"] 34 | } 35 | } 36 | pEnableTracing: false 37 | ``` 38 | 39 | ## Interface 40 | 41 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-dataquality` publishes the following parameters: 42 | 43 | | SSM Parameter | Description | Comment | 44 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 45 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Function | | 46 | -------------------------------------------------------------------------------- /docs/constructs/stage-glue.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-glue (sdlf-stageB) 2 | 3 | !!! note 4 | `sdlf-stage-glue` is defined in the [sdlf-stageB](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageB) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Glue](../_static/sdlf-stage-glue.png) 9 | 10 | Run a Glue job. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainB: 20 | Type: awslabs::sdlf::stageB::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 24 | pStageName: B 25 | pPipeline: main 26 | pTeamName: iot 27 | pTriggerType: schedule 28 | pEventPattern: !Sub >- 29 | { 30 | "source": ["aws.states"], 31 | "detail-type": ["Step Functions Execution Status Change"], 32 | "detail": { 33 | "status": ["SUCCEEDED"], 34 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 35 | } 36 | } 37 | pSchedule: "cron(*/5 * * * ? *)" 38 | pEnableTracing: false 39 | ``` 40 | 41 | ## Interface 42 | 43 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-glue` publishes the following parameters: 44 | 45 | | SSM Parameter | Description | Comment | 46 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 47 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions state machine | | 48 | -------------------------------------------------------------------------------- /docs/constructs/stage-lambda.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-lambda (sdlf-stageA) 2 | 3 | !!!
note 4 | `sdlf-stage-lambda` is defined in the [sdlf-stageA](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageA) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Lambda](../_static/sdlf-stage-lambda.png) 9 | 10 | Run a Lambda function. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainA: 20 | Type: awslabs::sdlf::stageA::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: A 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: >- 28 | { 29 | "source": ["aws.s3"], 30 | "detail-type": ["Object Created"], 31 | "detail": { 32 | "bucket": { 33 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 34 | }, 35 | "object": { 36 | "key": [{ "prefix": "iot/legislators/" }] 37 | } 38 | } 39 | } 40 | pEnableTracing: false 41 | ``` 42 | 43 | ## Interface 44 | 45 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-lambda` publishes the following parameters: 46 | 47 | | SSM Parameter | Description | Comment | 48 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 49 | | `/SDLF/Lambda/{team}/{pipeline}{stage}RoutingLambda` | Routing Lambda | | 50 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions | | 51 | -------------------------------------------------------------------------------- /docs/constructs/team.md: -------------------------------------------------------------------------------- 1 | # sdlf-team 2 | 3 | !!! note 4 | `sdlf-team` is defined in the [sdlf-team](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-team) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Team](../_static/sdlf-team.png){: style="width:80%"} 9 | 10 | A team is a group of individuals that wish to onboard into the data lake. It can be a pizza team of developers or an entire Business Unit such as the marketing or finance department. A team is responsible for their data pipelines, datasets and repositories which are unique to the team and completely segregated from others. Teams are also isolated from both an operational and security standpoint through least-privilege IAM policies. 11 | 12 | As such `sdlf-team` is mostly about permissions. 13 | 14 | The two `Pipelines` and `Datasets` Lambda functions (and related resources) are used to populate the DynamoDB tables `octagon-Pipelines-{environment}` and `octagon-Datasets-{environment}` from `sdlf-foundations`. 15 | 16 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 17 | 18 | !!! warning 19 | The data lake admin team should be the only one with write access to the `sdlf-team` code base, as it is used to restrict permissions given to team members. 20 | 21 | ## Usage 22 | 23 | ### CloudFormation with [sdlf-cicd](cicd.md) 24 | 25 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 
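Since the `Datasets` and `Pipelines` functions register entries in the `octagon-Datasets-{environment}` and `octagon-Pipelines-{environment}` DynamoDB tables, a quick way to check what a team has onboarded is to read those tables directly. A boto3 sketch, assuming the `dev` environment used in the example below:

```python
import boto3

dynamodb = boto3.resource("dynamodb")

# Table populated by the sdlf-team Datasets function (table itself is created by sdlf-foundations)
table = dynamodb.Table("octagon-Datasets-dev")
for item in table.scan()["Items"]:
    print(item)
```

The team module itself is declared as follows: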
26 | 27 | ``` 28 | rExample: 29 | Type: awslabs::sdlf::team::MODULE 30 | Properties: 31 | pPipelineReference: !Ref pPipelineReference 32 | pTeamName: industry 33 | pEnvironment: dev 34 | pSNSNotificationsEmail: nobody@amazon.com 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-team` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Athena/{team}/WorkgroupName` | Team Athena workgroup name | | 44 | | `/SDLF/EventBridge/{team}/EventBusName` | Name of the team dedicated event bus | | 45 | | `/SDLF/EventBridge/{team}/ScheduleGroupName` | Name of the team dedicated schedule group | | 46 | | `/SDLF/Glue/${pTeamName}/SecurityConfigurationId` | Glue security configuration name | | 47 | | `/SDLF/IAM/${pTeamName}/CrawlerRoleArn` | IAM Role ARN for Glue crawlers | | 48 | | `/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary` | ARN of the permissions boundary IAM Managed policy for the team | | 49 | | `/SDLF/KMS/${pTeamName}/DataKeyId` | ARN of the team KMS data key | | 50 | | `/SDLF/KMS/${pTeamName}/InfraKeyId` | ARN of the team KMS infrastructure key | | 51 | | `/SDLF/SNS/${pTeamName}/Notifications` | ARN of the team-specific SNS Topic | | 52 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # AWS Serverless Data Lake Framework 2 | 3 | Serverless Data Lake Framework (SDLF) is a collection of reusable artifacts aimed at accelerating the delivery of enterprise data lakes on AWS, shortening the deployment time to production from several months to a few weeks. It can be used by AWS teams, partners and customers to implement the foundational structure of a data lake following best practices. 4 | 5 | ## Motivation 6 | 7 | A data lake gives your organization agility. It provides a repository where consumers can quickly find the data they need and use it in their business projects. However, building a data lake can be complex; there's a lot to think about beyond the storage of files. For example, how do you catalog the data so you know what you've stored? What ingestion pipelines do you need? How do you manage data quality? How do you keep the code for your transformations under source control? How do you manage development, test and production environments? Building a solution that addresses these use cases can take many weeks and this time can be better spent innovating with data and achieving business goals. 8 | 9 | SDLF is a collection of production-hardened, best-practices templates which accelerate your data lake implementation journey on AWS, so that you can focus on use cases that generate value for business. 10 | 11 | ## Major Features 12 | 13 | At a high level, SDLF is an infrastructure-as-code framework that enables customers to create: 14 | 15 | - End-to-end data architectures such as a centralized (transactional) data lake or a data mesh 16 | - Foundational data lake assets (e.g. 
Amazon S3 buckets for data storage) 17 | - Event-driven jobs that orchestrate the transformation of data, storing the output in a new location on S3 18 | - Data processing stages using AWS serverless services such as Lambda or Glue 19 | - Git-driven deployment pipelines (CICD) for the entire data infrastructure 20 | 21 | Using all SDLF features as illustrated in the [official workshop](https://sdlf.workshop.aws/) gives you: 22 | 23 | 1. **Traceability and version control**: 24 | - SDLF is entirely managed through CICD pipelines. At no point is interaction with the AWS console necessary (in fact it's discouraged). 25 | - Using version control ensures that any change to the data lake is scrutinized before it enters production. 26 | 27 | 2. **Scalability and reproducibility**: 28 | - Deploying and tearing down a customized, production-grade data lake can be done in minutes and across multiple accounts and environments. 29 | - This is in comparison to a manual approach which would be tedious, slow, prone to errors and unable to scale. 30 | 31 | 3. **Best practices**: 32 | - Best practices acquired through dozens of implementations in production are enforced in the framework. 33 | - Features such as monitoring (S3 Storage Lens, Cloudtrail), encryption (KMS), alerting (Cloudwatch alarms), data permissions (Lake Formation) and many more are baked in SDLF so you don't have to reinvent the wheel. 34 | 35 | ## Public References 36 | 37 | ![SDLF Public References](_static/public-references.png) -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs==1.6.1 2 | mkdocs-material==9.5.36 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | extend-exclude = ["sdlf-cicd/sam-translate.py"] 3 | line-length = 120 4 | target-version = "py312" 5 | 6 | [tool.ruff.lint] 7 | extend-select = ["I", "PL", "W"] 8 | ignore = ["PLR0912", "PLR0913", "PLR0915"] 9 | fixable = ["I001", "W291"] 10 | 11 | [tool.pylint.main] 12 | py-version = "3.12" 13 | ignore-paths = ["^/sdlf-cicd/sam-translate.py"] 14 | jobs = 0 15 | 16 | [tool.pylint.format] 17 | max-line-length = 120 18 | max-module-lines = 1500 19 | 20 | [tool.pylint.logging] 21 | # The type of string formatting that logging methods do. `old` means using % 22 | # formatting, `new` is for `{}` formatting. 23 | logging-format-style = "new" 24 | 25 | # Logging modules to check that the string format arguments are in logging 26 | # function parameter format. 
27 | logging-modules = ["logging", "datalake_library.commons"] 28 | 29 | [tool.pylint.similarities] 30 | min-similarity-lines = 10 31 | -------------------------------------------------------------------------------- /sdlf-cicd/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv 22 | -------------------------------------------------------------------------------- /sdlf-cicd/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-cicd/README.md -------------------------------------------------------------------------------- /sdlf-cicd/lambda/stagesrepositories-cicd/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | 9 | codecommit_endpoint_url = "https://codecommit." + os.getenv("AWS_REGION") + ".amazonaws.com" 10 | codecommit = boto3.client("codecommit", endpoint_url=codecommit_endpoint_url) 11 | codepipeline_endpoint_url = "https://codepipeline." + os.getenv("AWS_REGION") + ".amazonaws.com" 12 | codepipeline = boto3.client("codepipeline", endpoint_url=codepipeline_endpoint_url) 13 | 14 | 15 | def lambda_handler(event, context): 16 | try: 17 | sdlf_stage_repositories = [] 18 | next_token = None 19 | while True: 20 | if next_token: 21 | response = codecommit.list_repositories(nextToken=next_token) 22 | else: 23 | response = codecommit.list_repositories() 24 | repos = response["repositories"] 25 | sdlf_stage_repositories.extend( 26 | [ 27 | repo["repositoryName"] 28 | for repo in repos 29 | if repo["repositoryName"].startswith(os.getenv("STAGES_REPOSITORIES_PREFIX")) 30 | ] 31 | ) 32 | next_token = response.get("nextToken") 33 | if not next_token: 34 | break 35 | 36 | logger.info("sdlf_stage_repositories: %s", sdlf_stage_repositories) 37 | 38 | except Exception as e: 39 | message = "Function exception: " + str(e) 40 | codepipeline.put_job_failure_result( 41 | jobId=event["CodePipeline.job"]["id"], 42 | failureDetails={"message": message, "type": "JobFailed"}, 43 | ) 44 | raise 45 | 46 | codepipeline.put_job_success_result( 47 | jobId=event["CodePipeline.job"]["id"], 48 | outputVariables={ 49 | "StagesRepositories": ",".join(sdlf_stage_repositories), 50 | "StagesRepositoriesCount": ",".join(list(map(str, range(0, len(sdlf_stage_repositories))))), 51 | }, 52 | ) 53 | return "Success" 54 | -------------------------------------------------------------------------------- /sdlf-cicd/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-cicd/template-cfn-module.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | 
Type: String 8 | pEnvironment: 9 | Description: Environment name 10 | Type: String 11 | AllowedValues: [dev, test, prod] 12 | pDomain: 13 | Description: Name of the data domain (all lowercase, no symbols or spaces) 14 | Type: String 15 | pTeamName: 16 | Description: Name of the team (all lowercase, no symbols or spaces) 17 | Type: String 18 | pModuleName: 19 | Description: Name of the module 20 | Type: String 21 | pModuleGitRef: 22 | Description: Git reference (commit id) with the sources of this module version 23 | Type: String 24 | 25 | Resources: 26 | rCloudFormationModule: 27 | Type: AWS::CloudFormation::ModuleVersion 28 | Properties: 29 | ModuleName: !Sub "${pDomain}::${pTeamName}::${pModuleName}::MODULE" 30 | ModulePackage: !Sub "s3://${pArtifactsBucket}/modules/${pDomain}/${pEnvironment}/${pTeamName}/${pModuleName}-${pModuleGitRef}.zip" 31 | 32 | rCloudFormationModuleDefaultVersion: 33 | Type: AWS::CloudFormation::ModuleDefaultVersion 34 | Properties: 35 | Arn: !Ref rCloudFormationModule 36 | 37 | rCloudFormationModuleSsm: 38 | Type: AWS::SSM::Parameter 39 | DependsOn: rCloudFormationModuleDefaultVersion 40 | Properties: 41 | Name: !Sub /SDLF/CFN/${pDomain}-${pTeamName}-${pModuleName}-MODULE 42 | Type: String 43 | Value: !Ref pModuleGitRef 44 | Description: Git reference (commit id) with the sources of this module version 45 | -------------------------------------------------------------------------------- /sdlf-cicd/template-cicd-team-repository.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: CICD resources to handle deployment of a new team (repository) 3 | 4 | Parameters: 5 | pKMSKey: 6 | Description: The KMS key used by CodeBuild and CodePipeline 7 | Type: AWS::SSM::Parameter::Value<String> 8 | Default: /SDLF/KMS/CICDKeyId 9 | pDomain: 10 | Description: Name of the data domain (all lowercase, no symbols or spaces) 11 | Type: String 12 | AllowedPattern: "[a-z0-9]{2,9}" 13 | pTeamName: 14 | Description: Team name 15 | Type: String 16 | pGitPlatform: 17 | Description: Platform used to host git repositories 18 | Type: AWS::SSM::Parameter::Value<String> 19 | Default: /SDLF/Misc/GitPlatform 20 | pMainRepositoriesPrefix: 21 | Type: String 22 | Default: sdlf-main- 23 | 24 | Conditions: 25 | GitPlatformCodeCommit: !Equals [!Ref pGitPlatform, "CodeCommit"] 26 | GitPlatformGitLab: !Equals [!Ref pGitPlatform, "GitLab"] 27 | GitPlatformGitHub: !Equals [!Ref pGitPlatform, "GitHub"] 28 | 29 | Resources: 30 | rTeamMainCodeCommit: 31 | Type: AWS::CodeCommit::Repository 32 | Condition: GitPlatformCodeCommit 33 | Metadata: 34 | cfn-lint: 35 | config: 36 | ignore_checks: 37 | - E3002 38 | Properties: 39 | Code: 40 | BranchName: main 41 | S3: ./README.md 42 | RepositoryDescription: !Sub ${pDomain} ${pTeamName} main repository 43 | RepositoryName: !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 44 | KmsKeyId: !Ref pKMSKey 45 | 46 | rTeamMainGitLab: 47 | Type: GitLab::Projects::Project 48 | Metadata: 49 | cfn-lint: 50 | config: 51 | ignore_checks: 52 | - E3001 53 | Condition: GitPlatformGitLab 54 | Properties: 55 | Name: !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 56 | # Path: "{{resolve:ssm:/SDLF/${pGitPlatform}/Group}}" 57 | 58 | rTeamMainGitHub: 59 | Type: GitHub::Repositories::Repository 60 | Metadata: 61 | cfn-lint: 62 | config: 63 | ignore_checks: 64 | - E3001 65 | Condition: GitPlatformGitHub 66 | Properties: 67 | Org: !Sub "{{resolve:ssm:/SDLF/${pGitPlatform}/Group}}" 68 | Name: !Sub
${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 69 | Private: true 70 | Visibility: private 71 | Archived: false 72 | 73 | rTeamMainCodeCommitSsm: 74 | Type: AWS::SSM::Parameter 75 | Properties: 76 | Name: !Sub /SDLF/${pGitPlatform}/${pTeamName}/Main${pGitPlatform} 77 | Type: String 78 | Value: !If 79 | - GitPlatformCodeCommit 80 | - !GetAtt rTeamMainCodeCommit.Name 81 | - !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} # !GetAtt rTeamMainGitLab.Name 82 | Description: !Sub Name of the ${pDomain} ${pTeamName} main repository 83 | -------------------------------------------------------------------------------- /sdlf-cicd/template-generic-cfn-module.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | Type: String 8 | pLibraryOrg: 9 | Description: Name of the org (all lowercase, no symbols or spaces) 10 | Type: String 11 | pLibraryFramework: 12 | Description: Name of the framework (all lowercase, no symbols or spaces) 13 | Type: String 14 | pLibraryModule: 15 | Description: Name of the module 16 | Type: String 17 | pModuleGitRef: 18 | Description: Git reference (commit id) with the sources of this module version 19 | Type: String 20 | 21 | Resources: 22 | rCloudFormationModule: 23 | Type: AWS::CloudFormation::ModuleVersion 24 | Properties: 25 | ModuleName: !Sub "${pLibraryOrg}::${pLibraryFramework}::${pLibraryModule}::MODULE" 26 | ModulePackage: !Sub "s3://${pArtifactsBucket}/modules/${pLibraryOrg}/${pLibraryFramework}/${pLibraryModule}-${pModuleGitRef}.zip" 27 | 28 | rCloudFormationModuleDefaultVersion: 29 | Type: AWS::CloudFormation::ModuleDefaultVersion 30 | Properties: 31 | Arn: !Ref rCloudFormationModule 32 | 33 | rCloudFormationModuleSsm: 34 | Type: AWS::SSM::Parameter 35 | DependsOn: rCloudFormationModuleDefaultVersion 36 | Properties: 37 | Name: !Sub /SDLF/CFN/${pLibraryOrg}-${pLibraryFramework}-${pLibraryModule}-MODULE 38 | Type: String 39 | Value: !Ref pModuleGitRef 40 | Description: Git reference (commit id) with the sources of this module version -------------------------------------------------------------------------------- /sdlf-cicd/template-generic-cfn-template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pModuleName: 6 | Description: Name of the module 7 | Type: String 8 | pModuleGitRef: 9 | Description: Git reference (commit id) with the sources of this module version 10 | Type: String 11 | pModuleS3Url: 12 | Description: S3 URL (https) to the module template 13 | Type: String 14 | 15 | Resources: 16 | rCloudFormationModuleSsm: 17 | Type: AWS::SSM::Parameter 18 | Properties: 19 | Name: !Sub /sdlf/${pModuleName}/${pModuleGitRef} 20 | Type: String 21 | Value: !Ref pModuleS3Url 22 | Description: S3 URL (https) to the module template of this module version 23 | -------------------------------------------------------------------------------- /sdlf-cicd/template-glue-job.part: -------------------------------------------------------------------------------- 1 | 2 | r%{BUILDSTEPVARIABLE_NOHYPHEN_AZ}GlueConnection: 3 | Type: AWS::Glue::Connection 4 | Condition: RunInVpc 5 | Metadata: 6 | cfn-lint: 7 | config: 8 | ignore_checks: 9 | - W3010 10 | Properties: 11 | CatalogId: !Ref 
AWS::AccountId 12 | ConnectionInput: 13 | ConnectionProperties: {} 14 | ConnectionType: NETWORK 15 | Description: "Network connected to the VPC data source" 16 | Name: !Sub sdlf-${pTeamName}-glue-conn-%{BUILDSTEPVARIABLE_NOHYPHEN_AZ} 17 | PhysicalConnectionRequirements: 18 | AvailabilityZone: %{BUILDSTEPVARIABLE_AZ} 19 | SecurityGroupIdList: !Split [",", !ImportValue sdlf-cicd-domain-roles-vpc-security-groups] 20 | SubnetId: %{BUILDSTEPVARIABLE_SUBNET} 21 | -------------------------------------------------------------------------------- /sdlf-cicd/template-glue-job.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Transform: AWS::LanguageExtensions 3 | Description: Deploy Glue jobs 4 | 5 | Parameters: 6 | pArtifactsBucket: 7 | Description: The artifacts bucket used by CodeBuild and CodePipeline 8 | Type: AWS::SSM::Parameter::Value 9 | Default: /SDLF/S3/ArtifactsBucket 10 | pTeamName: 11 | Description: Name of the team (all lowercase, no symbols or spaces) 12 | Type: String 13 | pGlueJobs: 14 | Description: List of glue job names 15 | Type: CommaDelimitedList 16 | AllowedPattern: "^[a-zA-Z0-9\\-]*$" 17 | pEnableVpc: 18 | Description: Deploy SDLF resources in a VPC 19 | Type: AWS::SSM::Parameter::Value 20 | Default: /SDLF/VPC/Enabled 21 | 22 | Conditions: 23 | GlueJobsNotEmpty: !Not 24 | - !Equals 25 | - !Join ["", !Ref pGlueJobs] 26 | - "" 27 | RunInVpc: !Equals [!Ref pEnableVpc, true] 28 | 29 | Resources: 30 | rGlueRole: 31 | Type: AWS::IAM::Role 32 | Properties: 33 | Path: /service-role/ 34 | PermissionsBoundary: !Sub "{{resolve:ssm:/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary}}" 35 | AssumeRolePolicyDocument: 36 | Version: 2012-10-17 37 | Statement: 38 | - Effect: Allow 39 | Principal: 40 | Service: 41 | - glue.amazonaws.com 42 | Action: 43 | - sts:AssumeRole 44 | ManagedPolicyArns: 45 | - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole 46 | - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess 47 | - !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchLogsFullAccess 48 | Policies: 49 | - PolicyName: !Sub sdlf-${pTeamName}-glue-job 50 | PolicyDocument: 51 | Version: 2012-10-17 52 | Statement: 53 | - Effect: Allow 54 | Action: 55 | - kms:CreateGrant 56 | - kms:Decrypt 57 | - kms:DescribeKey 58 | - kms:Encrypt 59 | - kms:GenerateDataKey* 60 | - kms:ReEncrypt* 61 | Resource: 62 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/InfraKeyId}}" 63 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/DataKeyId}}" 64 | - "{{resolve:ssm:/SDLF/KMS/KeyArn}}" 65 | 66 | "Fn::ForEach::GlueJobResources": 67 | - GlueJobName 68 | - !Ref pGlueJobs 69 | - "r&{GlueJobName}GlueJob": 70 | Type: AWS::Glue::Job 71 | Condition: GlueJobsNotEmpty 72 | Properties: 73 | Command: 74 | Name: glueetl 75 | PythonVersion: "3" 76 | ScriptLocation: !Sub s3://${pArtifactsBucket}/${pTeamName}/transforms/${GlueJobName}.py 77 | DefaultArguments: !If 78 | - RunInVpc 79 | - 80 | "--job-bookmark-option": job-bookmark-disable 81 | "--enable-glue-datacatalog": "true" 82 | "--enable-continuous-cloudwatch-log": "true" 83 | "--enable-continuous-log-filter": "true" 84 | "--enable-metrics": "true" 85 | "--disable-proxy-v2": "true" 86 | - 87 | "--job-bookmark-option": job-bookmark-disable 88 | "--enable-glue-datacatalog": "true" 89 | "--enable-continuous-cloudwatch-log": "true" 90 | "--enable-continuous-log-filter": "true" 91 | "--enable-metrics": "true" 92 | ExecutionProperty: 93 | MaxConcurrentRuns: 10 94 | MaxRetries: 0 95 | 
MaxCapacity: 2.0 96 | GlueVersion: "4.0" 97 | Name: !Sub 98 | - sdlf-${pTeamName}-${BaseGlueJobName} 99 | - BaseGlueJobName: !Select [0, !Split ["-", !Ref GlueJobName]] 100 | SecurityConfiguration: !Sub "{{resolve:ssm:/SDLF/Glue/${pTeamName}/SecurityConfigurationId}}" 101 | Role: !Ref rGlueRole 102 | Connections: !If 103 | - RunInVpc 104 | - Connections: 105 | - BUILDSTEPVARIABLE_GLUECONNECTIONS 106 | - !Ref "AWS::NoValue" 107 | -------------------------------------------------------------------------------- /sdlf-cicd/template-lambda-layer.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Transform: AWS::LanguageExtensions 3 | Description: Deploy Lambda Layers 4 | 5 | Parameters: 6 | pArtifactsBucket: 7 | Description: The artifacts bucket used by CodeBuild and CodePipeline 8 | Type: String 9 | pDomain: 10 | Description: Name of the data domain (all lowercase, no symbols or spaces) 11 | Type: String 12 | pEnvironment: 13 | Description: Environment name 14 | Type: String 15 | AllowedValues: [dev, test, prod] 16 | pTeamName: 17 | Description: Name of the team (all lowercase, no symbols or spaces) 18 | Type: String 19 | pLayers: 20 | Description: List of folder names from layers/ directory 21 | Type: CommaDelimitedList 22 | AllowedPattern: "^[a-zA-Z0-9]*$" 23 | pGitRef: 24 | Description: Git reference (commit id) with the sources of these layers 25 | Type: String 26 | 27 | Conditions: 28 | DatalakeLibraryLayer: !Equals [!Ref pTeamName, sdlf] 29 | 30 | Resources: 31 | ######## LAMBDA LAYERS ######## 32 | "Fn::ForEach::LambdaLayerResources": 33 | - LayerName 34 | - !Ref pLayers 35 | - "r${LayerName}LambdaLayer": 36 | Type: AWS::Lambda::LayerVersion 37 | Properties: 38 | CompatibleRuntimes: 39 | - python3.12 40 | Content: 41 | S3Bucket: !Ref pArtifactsBucket 42 | S3Key: !Sub layers/${pDomain}/${pEnvironment}/${pTeamName}/${LayerName}-${pGitRef}.zip 43 | Description: !Sub ${pTeamName} ${LayerName} Lambda Layer 44 | LayerName: 45 | !If [ 46 | DatalakeLibraryLayer, 47 | !Sub "sdlf-${LayerName}", 48 | !Sub "sdlf-${pTeamName}-${LayerName}" 49 | ] 50 | "r${LayerName}LambdaLayerSsm": 51 | Type: AWS::SSM::Parameter 52 | Properties: 53 | Name: 54 | !If [ 55 | DatalakeLibraryLayer, 56 | !Sub "/SDLF/Lambda/Latest${LayerName}Layer", 57 | !Sub "/SDLF/Lambda/${pTeamName}/Latest${LayerName}Layer" 58 | ] 59 | Type: String 60 | Value: !Ref 61 | "Fn::Sub": r${LayerName}LambdaLayer 62 | Description: !Sub The ARN of the latest version of the ${pTeamName} ${LayerName} layer 63 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/README.md: -------------------------------------------------------------------------------- 1 | # Datalake Library 2 | The data lake library repository is where a team pushes the transformation code (i.e. business logic) that they wish to apply to their datasets. 
After each new commit, the repository is automatically packaged into a Lambda layer and attached to the individual Lambda functions of the pipelines belonging to the team. The repository also holds helper functions that automate boilerplate code such as SQS, S3, and DynamoDB operations. 3 | 4 | ## IMPORTANT 5 | Please ensure that you follow this file structure, with a folder named `python` at the root containing all the Lambda code that should be part of the layer. The automated build process depends on the file structure being as follows: 6 | 7 | ./ 8 | ├── README.md 9 | └── python 10 | └── datalake_library 11 | ├── configuration 12 | ├── interfaces 13 | ├── octagon 14 | ├── tests 15 | └── transforms 16 | ├── stage_a_transforms 17 | │ ├── light_transform_blueprint.py 18 | │ ├── ... 19 | ├── stage_b_transforms 20 | │ ├── heavy_transform_blueprint.py 21 | │ └── ... 22 | └── transform_handler.py 23 | 24 | ## Adding Transformations 25 | When adding custom transformations to the Lambda layer, simply add your code to this repository (see the example `light_transform_blueprint.py` in the file structure above) in the relevant location (e.g. `stage_a_transforms` for light transformations in StageA). Any changes to this repository should stay in branches while in development; once tested and stable, they can be merged into the relevant environment branch (`dev`, `test` or `main`). The pipeline triggers on commits to that branch and releases the changes automatically. 26 | 27 | ## Pipeline 28 | The CICD pipeline for this repository is defined in the `sdlf-team` repository for each team (`nested-stacks/template-cicd.yaml`). A CodeBuild job packages the code in this repository into a `.zip` file (excluding `__pycache__` files), which is then published as a Lambda layer. Due to Lambda layer size limits, the code in this repository must not exceed 50 MB zipped and 250 MB unzipped. 29 | 30 | Configuration details, e.g. the name of the Lambda layer built from this repository, are defined in the template containing the **sdlf-pipeline** infrastructure. Configuration details available for customization include: 31 | 1. Through the pipeline: 32 | 1. Main Git branch to use — currently set to `dev` 33 | 2. Through the CodeBuild job: 34 | 1. Name of the resulting layer 35 | 2. Compatible runtimes 36 | 3.
SSM parameter used to store the ARN of the latest version 37 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/buildspec.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CFN_ENDPOINT="https://cloudformation.$AWS_REGION.amazonaws.com" 4 | 5 | pip uninstall -y aws-sam-cli && unzip -q aws-sam-cli-linux-x86_64.zip -d sam-installation 6 | ./sam-installation/install && sam --version 7 | pip install "cfn-lint<1" cloudformation-cli 8 | 9 | # removing everything up to the first hyphen, then anything that isn't a letter/number, and lower-casing what's left 10 | module_name_without_prefix="${SDLF_CONSTRUCT#*-}" 11 | module_name_alnum="${module_name_without_prefix//[^[:alnum:]]/}" 12 | MODULE="${module_name_alnum,,}" 13 | MODULE="DatalakeLibrary" # TODO 14 | 15 | mkdir artifacts 16 | zip -r artifacts/datalake_library.zip ./python -x \*__pycache__\* 17 | LAYER_HASH="$(sha256sum artifacts/datalake_library.zip | cut -c1-12)" 18 | aws s3api put-object --bucket "$ARTIFACTS_BUCKET" \ 19 | --key "sdlf/layers/$MODULE-$LAYER_HASH.zip" \ 20 | --body artifacts/datalake_library.zip 21 | 22 | STACK_NAME="sdlf-lambdalayers-$MODULE" 23 | aws cloudformation --endpoint-url "$CFN_ENDPOINT" deploy \ 24 | --stack-name "$STACK_NAME" \ 25 | --template-file ./template-lambda-layer.yaml \ 26 | --parameter-overrides \ 27 | pArtifactsBucket="$ARTIFACTS_BUCKET" \ 28 | pLayerName="$MODULE" \ 29 | pGitRef="$LAYER_HASH" \ 30 | --tags Framework=sdlf \ 31 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" || exit 1 32 | 33 | echo "done" 34 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-datalakeLibrary/python/__init__.py -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/commons.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional 3 | 4 | from boto3.dynamodb.types import TypeDeserializer, TypeSerializer 5 | 6 | if TYPE_CHECKING: 7 | from mypy_boto3_dynamodb.type_defs import ( 8 | AttributeValueTypeDef, 9 | ) 10 | 11 | 12 | def init_logger(file_name, log_level=None): 13 | if not log_level: 14 | log_level = "INFO" 15 | logging.basicConfig() 16 | logger = logging.getLogger(file_name) 17 | logger.setLevel(getattr(logging, log_level)) 18 | return logger 19 | 20 | 21 | def serialize_dynamodb_item( 22 | item: Mapping[str, Any], serializer: Optional[TypeSerializer] = None 23 | ) -> Dict[str, "AttributeValueTypeDef"]: 24 | serializer = serializer if serializer else TypeSerializer() 25 | return {k: serializer.serialize(v) for k, v in item.items()} 26 | 27 | 28 | def deserialize_dynamodb_item( 29 | item: Mapping[str, "AttributeValueTypeDef"], deserializer: Optional[TypeDeserializer] = None 30 | ) -> Dict[str, Any]: 31 | deserializer = deserializer if deserializer else TypeDeserializer() 32 | return {k: deserializer.deserialize(v) for k, v in item.items()} 33 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/datalake_exceptions.py: 
-------------------------------------------------------------------------------- 1 | class ObjectDeleteFailedException(Exception): 2 | """Raised when the lambda fails to delete a file(s)""" 3 | 4 | pass 5 | 6 | 7 | class InvalidS3PutEventException(Exception): 8 | """Raised when the object added to the bucket according to the provided event does not match the expected pattern""" 9 | 10 | pass 11 | 12 | 13 | class UnprocessedKeysException(RuntimeError): 14 | """Raised when keys are unprocessed, either because the batch limit is exceeded, the size of the response is too big 15 | (>16Mb) or the keys were throttled because of ProvisionedReads too low on ddb""" 16 | 17 | pass 18 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-datalakeLibrary/python/datalake_library/interfaces/__init__.py -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/sqs_interface.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import uuid 4 | 5 | import boto3 6 | from botocore.client import Config 7 | from botocore.exceptions import ClientError 8 | 9 | from ..commons import init_logger 10 | 11 | 12 | class SQSInterface: 13 | def __init__(self, queue_name, log_level=None, sqs_client=None): 14 | self.log_level = log_level or os.getenv("LOG_LEVEL", "INFO") 15 | self._logger = init_logger(__name__, self.log_level) 16 | sqs_endpoint_url = "https://sqs." + os.getenv("AWS_REGION") + ".amazonaws.com" 17 | session_config = Config(user_agent="awssdlf/2.10.0") 18 | self._sqs_client = sqs_client or boto3.client("sqs", endpoint_url=sqs_endpoint_url, config=session_config) 19 | 20 | self._message_queue = self._sqs_client.get_queue_url(QueueName=queue_name)["QueueUrl"] 21 | 22 | def receive_messages(self, max_num_messages=1): 23 | messages = self._sqs_client.receive_message( 24 | QueueUrl=self._message_queue, MaxNumberOfMessages=max_num_messages, WaitTimeSeconds=1 25 | )["Messages"] 26 | for message in messages: 27 | self._sqs_client.delete_message(QueueUrl=self._message_queue, ReceiptHandle=message["ReceiptHandle"]) 28 | return messages 29 | 30 | def receive_min_max_messages(self, min_items_process, max_items_process): 31 | """Gets max_items_process messages from an SQS queue. 32 | :param min_items_process: Minimum number of items to process. 33 | :param max_items_process: Maximum number of items to process. 
34 | :return messages obtained 35 | """ 36 | messages = [] 37 | num_messages_queue = int( 38 | self._sqs_client.get_queue_attributes( 39 | QueueUrl=self._message_queue, AttributeNames=["ApproximateNumberOfMessages"] 40 | )["Attributes"]["ApproximateNumberOfMessages"] 41 | ) 42 | 43 | # If not enough items to process, break with no messages 44 | if (num_messages_queue == 0) or (min_items_process > num_messages_queue): 45 | self._logger.info("Not enough messages - exiting") 46 | return messages 47 | 48 | # Only pull batch sizes of max_batch_size 49 | num_messages_queue = min(num_messages_queue, max_items_process) 50 | max_batch_size = 10 51 | batch_sizes = [max_batch_size] * math.floor(num_messages_queue / max_batch_size) 52 | if num_messages_queue % max_batch_size > 0: 53 | batch_sizes += [num_messages_queue % max_batch_size] 54 | 55 | for batch_size in batch_sizes: 56 | resp_msg = self.receive_messages(max_num_messages=batch_size) 57 | try: 58 | messages.extend(message["Body"] for message in resp_msg) 59 | except KeyError: 60 | break 61 | return messages 62 | 63 | def send_message_to_fifo_queue(self, message, group_id): 64 | try: 65 | self._sqs_client.send_message( 66 | QueueUrl=self._message_queue, 67 | MessageBody=message, 68 | MessageGroupId=group_id, 69 | MessageDeduplicationId=str(uuid.uuid1()), 70 | ) 71 | except ClientError as e: 72 | self._logger.error("Received error: %s", e, exc_info=True) 73 | raise e 74 | 75 | def send_batch_messages_to_fifo_queue(self, messages, batch_size, group_id): 76 | try: 77 | chunks = [messages[x : x + batch_size] for x in range(0, len(messages), batch_size)] 78 | for chunk in chunks: 79 | entries = [] 80 | for x in chunk: 81 | entry = { 82 | "Id": str(uuid.uuid1()), 83 | "MessageBody": str(x), 84 | "MessageGroupId": group_id, 85 | "MessageDeduplicationId": str(uuid.uuid1()), 86 | } 87 | entries.append(entry) 88 | self._sqs_client.send_message_batch(QueueUrl=self._message_queue, Entries=entries) 89 | except ClientError as e: 90 | self._logger.error("Received error: %s", e, exc_info=True) 91 | raise e 92 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/states_interface.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from datetime import date, datetime 4 | 5 | import boto3 6 | from botocore.client import Config 7 | 8 | from ..commons import init_logger 9 | 10 | 11 | class StatesInterface: 12 | def __init__(self, log_level=None, states_client=None): 13 | self.log_level = log_level or os.getenv("LOG_LEVEL", "INFO") 14 | self._logger = init_logger(__name__, self.log_level) 15 | stepfunctions_endpoint_url = "https://states." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 16 | session_config = Config(user_agent="awssdlf/2.10.0") 17 | self._states_client = states_client or boto3.client( 18 | "stepfunctions", endpoint_url=stepfunctions_endpoint_url, config=session_config 19 | ) 20 | 21 | @staticmethod 22 | def json_serial(obj): 23 | """JSON serializer for objects not serializable by default""" 24 | if isinstance(obj, (datetime, date)): 25 | return obj.isoformat() 26 | raise TypeError("Type %s not serializable" % type(obj)) 27 | 28 | def get_all_step_functions(self): 29 | self._logger.info("obtaining a list of all step functions") 30 | pages = self._states_client.get_paginator("list_state_machines").paginate() 31 | step_functions = [] 32 | for result in pages: 33 | step_functions.extend(result["stateMachines"]) 34 | return step_functions 35 | 36 | def run_state_machine(self, machine_arn, message): 37 | self._logger.info("running state machine with arn {}".format(machine_arn)) 38 | return self._states_client.start_execution( 39 | stateMachineArn=machine_arn, input=json.dumps(message, default=self.json_serial) 40 | ) 41 | 42 | def describe_state_execution(self, execution_arn): 43 | self._logger.info("describing {}".format(execution_arn)) 44 | response = self._states_client.describe_execution(executionArn=execution_arn) 45 | return response["status"] 46 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.35.25 2 | botocore==1.35.25 3 | boto3-stubs-lite[dynamodb]==1.35.25 4 | pytest==8.3.3 5 | pytest-mock==3.14.0 6 | python-dateutil==2.9.0 7 | pytest-cov==5.0.0 8 | mock==5.1.0 9 | coverage==7.6.1 10 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/sdlf/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | import logging 3 | 4 | from .__version__ import __title__, __version__ 5 | from .config import ( # noqa: F401; 6 | DynamoConfiguration, 7 | KMSConfiguration, 8 | S3Configuration, 9 | SQSConfiguration, 10 | StateMachineConfiguration, 11 | ) 12 | from .peh import PipelineExecutionHistoryAPI 13 | 14 | name = "sdlf" 15 | 16 | # Suppress boto3 logging 17 | logging.getLogger("boto3").setLevel(logging.CRITICAL) 18 | logging.getLogger("botocore").setLevel(logging.CRITICAL) 19 | logging.getLogger("s3transfer").setLevel(logging.CRITICAL) 20 | logging.getLogger("urllib3").setLevel(logging.CRITICAL) 21 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/sdlf/__version__.py: -------------------------------------------------------------------------------- 1 | __title__ = "SDLF" 2 | __version__ = "2.10.0" 3 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/sdlf/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import uuid 4 | 5 | 6 | def get_duration_sec(start_timestamp_str, end_timestamp_str): 7 | ts_format = "%Y-%m-%dT%H:%M:%S.%f%z" 8 | start_ts = datetime.datetime.strptime(start_timestamp_str, ts_format) 9 | end_ts = datetime.datetime.strptime(end_timestamp_str, ts_format) 10 | return (end_ts - start_ts).total_seconds() 11 | 12 | 13 | # 
datetime.datetime.now(datetime.UTC) 14 | def get_timestamp_iso(current_time=datetime.datetime.now(datetime.UTC)): 15 | return current_time.isoformat() 16 | 17 | 18 | # Return local date ISO formatted 19 | def get_local_date(local_time=datetime.datetime.now()): 20 | return local_time.strftime("%Y-%m-%d") 21 | 22 | 23 | def is_not_empty(arg): 24 | return (arg is not None) and (len(arg) != 0) 25 | 26 | 27 | def throw_if_none(arg, msg): 28 | if arg is None: 29 | raise ValueError(msg) 30 | 31 | 32 | def throw_none_or_empty(arg, msg): 33 | if (arg is None) or (len(arg) == 0): 34 | raise ValueError(msg) 35 | 36 | 37 | def validate_date(date_text): 38 | try: 39 | datetime.datetime.strptime(date_text, "%Y-%m-%d") 40 | except ValueError: 41 | raise ValueError("Incorrect date format, should be YYYY-MM-DD") 42 | 43 | 44 | def throw_if_false(condition, message): 45 | if not condition: 46 | raise ValueError(message) 47 | 48 | 49 | # Parses metrics string into a list of metric executions 50 | # E.g "Metric1#Metric2#Metric3" => ["Metric1", "Metric1#Metric2", "Metric1#Metric2#Metric3"] 51 | def parse_metrics(metrics_name): 52 | sep = "#" 53 | metric = [] 54 | arr = metrics_name.split(sep) 55 | 56 | if len(arr) != len(set(arr)): 57 | raise ValueError("Duplicated metrics are not allowed!") 58 | 59 | if sep in metrics_name: 60 | arr = metrics_name.split(sep) 61 | m = [] 62 | for item in arr: 63 | m.append(item) 64 | metric.append(sep.join(m)) 65 | else: 66 | metric.append(metrics_name) 67 | return metric 68 | 69 | 70 | def get_ttl(ttl_days, start_date=datetime.datetime.today()): 71 | """Get ttl value epoch format to insert into DDB TTL field 72 | 73 | Arguments: 74 | ttl_days {int} -- Number of days to keep the record 75 | 76 | Keyword Arguments: 77 | start_date {datetime} -- Starting timestamp (default: {datetime.datetime.today()}) 78 | 79 | Returns: 80 | int -- Value to insert into DynamoDB TTL field 81 | """ 82 | ttl_date = start_date + datetime.timedelta(days=ttl_days) 83 | expiry_ttl = int(time.mktime(ttl_date.timetuple())) 84 | return expiry_ttl 85 | 86 | 87 | def is_valid_uuid(val): 88 | try: 89 | uuid.UUID(str(val)) 90 | return True 91 | except ValueError: 92 | return False 93 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/template-lambda-layer.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy Lambda Layer 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | Type: String 8 | pLayerName: 9 | Description: Name of the lambda layer 10 | Type: String 11 | AllowedPattern: "^[a-zA-Z0-9]*$" 12 | pGitRef: 13 | Description: Git reference (commit id) with the sources of these layers 14 | Type: String 15 | 16 | Resources: 17 | rDatalakeLibraryLambdaLayer: 18 | Type: AWS::Lambda::LayerVersion 19 | Properties: 20 | CompatibleRuntimes: 21 | - python3.12 22 | Content: 23 | S3Bucket: !Ref pArtifactsBucket 24 | S3Key: !Sub sdlf/layers/${pLayerName}-${pGitRef}.zip 25 | Description: !Sub ${pLayerName} Lambda Layer 26 | LayerName: !Sub "sdlf-${pLayerName}" 27 | 28 | rDatalakeLibraryLambdaLayerSsm: 29 | Type: AWS::SSM::Parameter 30 | Properties: 31 | Name: !Sub "/SDLF/Lambda/Latest${pLayerName}Layer" 32 | Type: String 33 | Value: !Ref rDatalakeLibraryLambdaLayer 34 | Description: !Sub The ARN of the latest version of the ${pLayerName} layer 35 | 
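The layer published by the template above is advertised through an SSM parameter named `/SDLF/Lambda/Latest${pLayerName}Layer`. Below is a minimal sketch of how a consumer could resolve the resulting layer version ARN with boto3; it assumes the `DatalakeLibrary` layer name hard-coded in `buildspec.sh` and an `AWS_REGION` variable in the environment, both of which are illustrative assumptions rather than part of the template itself.

```
import os

import boto3


# Sketch: resolve the latest datalake_library Lambda layer ARN published by
# template-lambda-layer.yaml. The parameter name follows the
# /SDLF/Lambda/Latest<LayerName>Layer convention defined above; "DatalakeLibrary"
# is the layer name set in buildspec.sh (assumed here for illustration).
def get_latest_layer_arn(layer_name="DatalakeLibrary"):
    ssm_endpoint_url = "https://ssm." + os.getenv("AWS_REGION") + ".amazonaws.com"
    ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url)
    return ssm.get_parameter(Name=f"/SDLF/Lambda/Latest{layer_name}Layer")["Parameter"]["Value"]


if __name__ == "__main__":
    # Prints an ARN of the form arn:aws:lambda:<region>:<account>:layer:sdlf-DatalakeLibrary:<version>
    print(get_latest_layer_arn())
```

CloudFormation templates can consume the same parameter with a dynamic reference (e.g. `{{resolve:ssm:/SDLF/Lambda/LatestDatalakeLibraryLayer}}`), the same pattern the other templates in this repository use for similar SSM parameters.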
-------------------------------------------------------------------------------- /sdlf-dataset/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-dataset/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-dataset 2 | 3 | !!! note 4 | `sdlf-dataset` is defined in the [sdlf-dataset](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-dataset) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Dataset](../_static/sdlf-dataset.png) 9 | 10 | A SDLF dataset is a logical construct referring to a grouping of data. It can be anything from a single table to an entire database with multiple tables for example. However, an overall good practice is to limit the infrastructure deployed to the minimum to avoid unnecessary overhead and cost. It means that in general, the more data is grouped together the better. Abstraction at the transformation code level can then help make distinctions within a given dataset. 11 | 12 | Examples of datasets are: 13 | 14 | - A relational database with multiple tables (e.g. Sales DB with orders and customers tables) 15 | - A group of files from a data source (e.g. XML files from a Telemetry system) 16 | - A streaming data source (e.g. Kinesis data stream batching files and dumping them into S3) 17 | 18 | `sdlf-dataset` creates a Glue database, as well as a Glue crawler. 19 | 20 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 21 | 22 | ## Usage 23 | 24 | ### CloudFormation with [sdlf-cicd](cicd.md) 25 | 26 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 27 | 28 | ``` 29 | rExample: 30 | Type: awslabs::sdlf::dataset::MODULE 31 | Properties: 32 | pPipelineReference: !Ref pPipelineReference 33 | pTeamName: iot 34 | pDatasetName: legislators 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
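For instance, a downstream job could look up the Glue crawler that `sdlf-dataset` provisions for a given team and dataset and start it. A minimal sketch, reusing the `iot` team and `legislators` dataset from the usage example above, and assuming the parameter stores the crawler name and that region/credentials are already configured:

```
import boto3

ssm = boto3.client("ssm")
glue = boto3.client("glue")

# Parameter names follow the /SDLF/Glue/{team}/{dataset}/GlueCrawler convention listed below;
# the value is assumed to be the crawler name.
crawler_name = ssm.get_parameter(Name="/SDLF/Glue/iot/legislators/GlueCrawler")["Parameter"]["Value"]
glue.start_crawler(Name=crawler_name)
```
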
`sdlf-dataset` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ----------------------------------------- | -------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Datasets/{team}/{dataset}` | Dataset-specific metadata for data pipelines | | 44 | | `/SDLF/Glue/{team}/{dataset}/GlueCrawler` | Team dataset Glue crawler | | 45 | | `/SDLF/Glue/{team}/{dataset}/DataCatalog` | Team dataset metadata catalog" | | 46 | -------------------------------------------------------------------------------- /sdlf-dataset/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.dataset" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | aws-cdk-aws-glue-alpha = "^2.159.1a0" 22 | aws-cdk-aws-scheduler-alpha = "^2.159.1a0" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | -------------------------------------------------------------------------------- /sdlf-dataset/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-dataset/src/__init__.py -------------------------------------------------------------------------------- /sdlf-foundations/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-foundations/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.foundations" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /sdlf-foundations/src/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-foundations/src/__init__.py -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/catalog-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | dlq_name = os.environ["DLQ"] 9 | queue_name = os.environ["QUEUE"] 10 | sqs_endpoint_url = "https://sqs." + os.getenv("AWS_REGION") + ".amazonaws.com" 11 | sqs = boto3.client("sqs", endpoint_url=sqs_endpoint_url) 12 | 13 | 14 | def lambda_handler(event, context): 15 | try: 16 | dlq_queue_url = sqs.get_queue_url(QueueName=dlq_name)["QueueUrl"] 17 | queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"] 18 | 19 | messages = sqs.receive_message(QueueUrl=dlq_queue_url, MaxNumberOfMessages=1, WaitTimeSeconds=1).get("Messages", []) 20 | if not messages: 21 | logger.info("No messages found in {}".format(dlq_name)) 22 | return 23 | 24 | logger.info("Received {} messages".format(len(messages))) 25 | for message in messages: 26 | sqs.send_message(QueueUrl=queue_url, MessageBody=message["Body"]) 27 | sqs.delete_message(QueueUrl=dlq_queue_url, ReceiptHandle=message["ReceiptHandle"])  # delete from the DLQ the message was received from 28 | logger.info("Delete message succeeded") 29 | except Exception as e: 30 | logger.error("Fatal error", exc_info=True) 31 | raise e 32 | return 33 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/catalog/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from datetime import UTC, datetime 5 | from urllib.parse import unquote_plus 6 | 7 | import boto3 8 | from botocore.config import Config 9 | from botocore.exceptions import ClientError 10 | 11 | session_config = Config(user_agent_extra="awssdlf/2.10.0") 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | dynamodb = boto3.client("dynamodb", config=session_config) 16 | catalog_table = os.getenv("OBJECTMETADATA_TABLE") 17 | 18 | 19 | def parse_s3_event(s3_event): 20 | return { 21 | "bucket": {"S": s3_event["detail"]["bucket"]["name"]}, 22 | "key": {"S": unquote_plus(s3_event["detail"]["object"]["key"])}, 23 | "size": {"N": str(s3_event["detail"]["object"]["size"])}, 24 | "last_modified_date": {"S": s3_event["time"]}, 25 | "timestamp": {"N": str(int(round(datetime.now(UTC).timestamp() * 1000, 0)))}, 26 | } 27 | 28 | 29 | def put_item(table, item, key): 30 | try: 31 | response = dynamodb.put_item( 32 | TableName=table, 33 | Item=item, 34 | ConditionExpression=f"attribute_not_exists({key})", 35 | ) 36 | except ClientError as e: 37 | if e.response["Error"]["Code"] == "ConditionalCheckFailedException": 38 | logger.info(e.response["Error"]["Message"]) 39 | else: 40 | raise 41 | else: 42 | return response 43 | 44 | 45 | def delete_item(table, key): 46 | try: 47 | response = dynamodb.delete_item(TableName=table, Key=key) 48 | except ClientError as e: 49 | logger.error("Fatal error", exc_info=True) 50 | raise e 51 | else: 52 | return response 53 | 54 | 55 | def lambda_handler(event, context): 56 | try: 57 | logger.info(f"Received {len(event['Records'])} messages") 58 | for record in
event["Records"]: 59 | logger.info("Parsing S3 Event") 60 | message = json.loads(record["body"]) 61 | operation = message["detail-type"] 62 | bucket = message["detail"]["bucket"]["name"] 63 | key = unquote_plus(message["detail"]["object"]["key"]) 64 | id = f"s3://{bucket}/{key}" 65 | 66 | logger.info(f"Performing Dynamo {operation} operation") 67 | if operation in ["Object Deleted"]: 68 | delete_item(catalog_table, {"id": id}) 69 | else: 70 | item = parse_s3_event(message) 71 | item["id"] = {"S": id} 72 | item["stage"] = {"S": bucket.split("-")[-1]} 73 | put_item(catalog_table, item, "id") 74 | except Exception as e: 75 | logger.error("Fatal error", exc_info=True) 76 | raise e 77 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/replicate/src/event-create-delete-table.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0", 3 | "id": "0000000-0000-5328-220a-21c060f6c3f4", 4 | "detail-type": "Glue Data Catalog Database State Change", 5 | "source": "aws.glue", 6 | "account": "123456789012", 7 | "time": "2019-01-16T18:08:48Z", 8 | "region": "us-east-1", 9 | "resources": [ 10 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/history", 11 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/organizations" 12 | ], 13 | "detail": { 14 | "databaseName": "forecourt_datalake_dev_engineering_legislators_db", 15 | "typeOfChange": "CreateTable", 16 | "changedTables": [ 17 | "history", 18 | "organizations" 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/replicate/src/event-update-table.json: -------------------------------------------------------------------------------- 1 | { 2 | "version":"0", 3 | "id":"1a2ac50f-11dc-111c-09f3-102e0932d2bf", 4 | "detail-type":"Glue Data Catalog Table State Change", 5 | "source":"aws.glue", 6 | "account":"123456789012", 7 | "time":"2020-07-08T12:20:19Z", 8 | "region":"us-east-1", 9 | "resources":[ 10 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/persons" 11 | ], 12 | "detail":{ 13 | "databaseName":"forecourt_datalake_dev_engineering_legislators_db", 14 | "typeOfChange":"UpdateTable", 15 | "tableName":"persons", 16 | "changedPartitions":[ 17 | ] 18 | } 19 | } -------------------------------------------------------------------------------- /sdlf-monitoring/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-monitoring/kibana/generic_dashboard.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_id": "e4069440-fb2c-11e9-86fb-19c7ff919e3c", 4 | "_type": "dashboard", 5 | "_source": { 6 | "title": "Generic Dashboard", 7 | "hits": 0, 8 | "description": "", 9 | "panelsJSON": 
"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":24,\"h\":15,\"i\":\"1\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"dec27710-fb2c-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":24,\"y\":0,\"w\":24,\"h\":15,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"Count\":\"#E24D42\"},\"legendOpen\":false}},\"id\":\"26819f40-fb2d-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":42,\"w\":48,\"h\":14,\"i\":\"5\"},\"embeddableConfig\":{\"vis\":{\"params\":{\"sort\":{\"columnIndex\":2,\"direction\":\"asc\"}}}},\"id\":\"60903910-fb2f-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":0,\"y\":15,\"w\":13,\"h\":12,\"i\":\"8\"},\"embeddableConfig\":{},\"id\":\"3c879b80-fbc5-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"9\",\"gridData\":{\"x\":37,\"y\":15,\"w\":11,\"h\":12,\"i\":\"9\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"/aws/lambda/f1-dl04-dev-activemq-process-a\":\"#EA6460\"},\"legendOpen\":false}},\"id\":\"945eca40-fbc5-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"12\",\"gridData\":{\"x\":13,\"y\":15,\"w\":24,\"h\":12,\"i\":\"12\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"Count\":\"#5195CE\"},\"legendOpen\":false}},\"id\":\"131befe0-fbd3-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"13\",\"gridData\":{\"x\":0,\"y\":27,\"w\":24,\"h\":15,\"i\":\"13\"},\"embeddableConfig\":{},\"id\":\"20f6a2e0-fbce-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"15\",\"gridData\":{\"x\":24,\"y\":27,\"w\":24,\"h\":15,\"i\":\"15\"},\"version\":\"6.3.1\",\"type\":\"visualization\",\"id\":\"6506b6d0-1048-11ea-a53e-f38a7f594614\",\"embeddableConfig\":{}}]", 10 | "optionsJSON": "{\"darkTheme\":false,\"hidePanelTitles\":false,\"useMargins\":true}", 11 | "version": 1, 12 | "timeRestore": false, 13 | "kibanaSavedObjectMeta": { 14 | "searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}" 15 | } 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /sdlf-pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.pipeline" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | 12 | packages = [ 13 | { include = 
"**/*", from = "src", to = "sdlf" }, 14 | ] 15 | 16 | exclude = ["**/*.yaml"] 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.12" 20 | aws-cdk-lib = "^2.159.1" 21 | constructs = ">=10.0.0,<11.0.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-pipeline/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-pipeline/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-dataquality/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/initial-check/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | 4 | import boto3 5 | from datalake_library.commons import init_logger 6 | from datalake_library.configuration.resource_configs import DynamoConfiguration 7 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | dynamodb = boto3.client("dynamodb") 12 | ssm_endpoint_url = "https://ssm." + os.getenv("AWS_REGION") + ".amazonaws.com" 13 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 14 | glue_endpoint_url = "https://glue." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 15 | glue = boto3.client("glue", endpoint_url=glue_endpoint_url) 16 | 17 | 18 | def get_glue_transform_details(bucket, team, dataset, env, pipeline, stage): 19 | dynamo_config = DynamoConfiguration() 20 | dynamo_interface = DynamoInterface(dynamo_config) 21 | 22 | transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}") 23 | 24 | glue_database = ssm.get_parameter(Name=f"/SDLF/Glue/{team}/{dataset}/DataCatalog")["Parameter"]["Value"] 25 | glue_capacity = {"NumberOfWorkers": 5} 26 | wait_time = 45 27 | 28 | dataquality_tables = [] 29 | 30 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 31 | if pipeline in transform_info.get("pipeline", {}): 32 | if stage in transform_info["pipeline"][pipeline]: 33 | logger.info(f"Details from DynamoDB: {transform_info['pipeline'][pipeline][stage]}") 34 | glue_capacity = transform_info["pipeline"][pipeline][stage].get("glue_capacity", glue_capacity) 35 | wait_time = transform_info["pipeline"][pipeline][stage].get("wait_time", wait_time) 36 | dataquality_tables = transform_info["pipeline"][pipeline][stage].get( 37 | "dataquality_tables", dataquality_tables 38 | ) 39 | 40 | return { 41 | "DatabaseName": glue_database, 42 | "wait_time": wait_time, 43 | "dataquality_tables": dataquality_tables, 44 | **glue_capacity, 45 | } 46 | 47 | 48 | def lambda_handler(event, context): 49 | """Calls custom transform developed by user 50 | 51 | Arguments: 52 | event {dict} -- Dictionary with details on previous processing step 53 | context {dict} -- Dictionary with details on Lambda context 54 | 55 | Returns: 56 | {dict} -- Dictionary with Data Quality Job details 57 | """ 58 | try: 59 | logger.info("Fetching event data from previous step") 60 | bucket = event["body"]["bucket"] 61 | team = event["body"]["team"] 62 | pipeline = event["body"]["pipeline"] 63 | stage = event["body"]["pipeline_stage"] 64 | dataset = event["body"]["dataset"] 65 | env = event["body"]["env"] 66 | 67 | # Checking if Data Quality is enabled on tables 68 | logger.info("Querying data quality enabled tables") 69 | event["body"]["glue"] = get_glue_transform_details(bucket, team, dataset, env, pipeline, stage) 70 | event["body"]["glue"]["crawler_name"] = "-".join(["sdlf", team, dataset, "post-stage-crawler"]) 71 | logger.info(event["body"]["glue"]) 72 | 73 | map_input = [] 74 | for table in event["body"]["glue"]["dataquality_tables"]: 75 | map_item = copy.deepcopy(event) 76 | map_item["body"]["glue"]["TableName"] = table 77 | map_input.append(map_item) 78 | 79 | except Exception as e: 80 | logger.error("Fatal error", exc_info=True) 81 | raise e 82 | return {"dataquality": map_input} 83 | -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/stage-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration, StateMachineConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | from datalake_library.interfaces.states_interface import StatesInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | team = os.environ["TEAM"] 15 | pipeline = os.environ["PIPELINE"] 16 | stage = os.environ["STAGE"] 17 | state_config = StateMachineConfiguration(team, pipeline, stage) 18 | 
sqs_config = SQSConfiguration(team, pipeline, stage) 19 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 20 | 21 | messages = dlq_interface.receive_messages(1) 22 | if not messages: 23 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 24 | return 25 | 26 | logger.info("Received {} messages".format(len(messages))) 27 | for message in messages: 28 | logger.info("Starting State Machine Execution") 29 | if isinstance(message["Body"], str): 30 | response = json.loads(message["Body"]) 31 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 32 | logger.info("Redrive message succeeded") 33 | except Exception as e: 34 | logger.error("Fatal error", exc_info=True) 35 | raise e 36 | return 37 | -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/stage-routing/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import ( 6 | S3Configuration, 7 | SQSConfiguration, 8 | StateMachineConfiguration, 9 | ) 10 | from datalake_library.interfaces.sqs_interface import SQSInterface 11 | from datalake_library.interfaces.states_interface import StatesInterface 12 | 13 | logger = init_logger(__name__) 14 | 15 | 16 | def lambda_handler(event, context): 17 | """Checks if any items need processing and triggers state machine 18 | Arguments: 19 | event {dict} -- Dictionary with details on what needs processing 20 | context {dict} -- Dictionary with details on Lambda context 21 | """ 22 | 23 | try: 24 | records = event["Records"] 25 | logger.info(f"Received {len(records)} messages") 26 | response = {} 27 | for record in records: 28 | event_body = json.loads(json.loads(record["body"])["output"])[0]["body"] 29 | logger.info(event_body) 30 | team = event_body["team"] 31 | pipeline = event_body["pipeline"] 32 | stage = os.environ["PIPELINE_STAGE"] 33 | dataset = event_body["dataset"] 34 | org = event_body["org"] 35 | domain = event_body["domain"] 36 | env = event_body["env"] 37 | stage_bucket = S3Configuration().stage_bucket 38 | 39 | response = { 40 | "statusCode": 200, 41 | "body": { 42 | "bucket": stage_bucket, 43 | "team": team, 44 | "pipeline": pipeline, 45 | "pipeline_stage": stage, 46 | "dataset": dataset, 47 | "org": org, 48 | "domain": domain, 49 | "env": env, 50 | }, 51 | } 52 | if response: 53 | logger.info("Starting State Machine Execution") 54 | state_config = StateMachineConfiguration(team, pipeline, stage) 55 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 56 | except Exception as e: 57 | # If failure send to DLQ 58 | sqs_config = SQSConfiguration(team, pipeline, stage) 59 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 60 | dlq_interface.send_message_to_fifo_queue(json.dumps(response), "failed") 61 | logger.error("Fatal error", exc_info=True) 62 | raise e 63 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 
21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-ecsfargate/README.md -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-ecsfargate" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-ecsfargate/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | 8 | logger = init_logger(__name__) 9 | team = os.environ["TEAM"] 10 | dataset = os.environ["DATASET"] 11 | pipeline = os.environ["PIPELINE"] 12 | pipeline_stage = os.environ["PIPELINE_STAGE"] 13 | org = os.environ["ORG"] 14 | domain = os.environ["DOMAIN"] 15 | env = os.environ["ENV"] 16 | 17 | 18 | def lambda_handler(event, context): 19 | try: 20 | if isinstance(event, str): 21 | event = json.loads(event) 22 | 23 | sqs_config = SQSConfiguration(team, pipeline, pipeline_stage) 24 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 25 | 26 | logger.info("Execution Failed. 
Sending original payload to DLQ") 27 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 28 | except Exception as e: 29 | logger.error("Fatal error", exc_info=True) 30 | raise e 31 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.octagon import peh 6 | 7 | logger = init_logger(__name__) 8 | team = os.environ["TEAM"] 9 | dataset = os.environ["DATASET"] 10 | pipeline = os.environ["PIPELINE"] 11 | pipeline_stage = os.environ["PIPELINE_STAGE"] 12 | org = os.environ["ORG"] 13 | domain = os.environ["DOMAIN"] 14 | env = os.environ["ENV"] 15 | 16 | 17 | def lambda_handler(event, context): 18 | """Updates the S3 objects metadata catalog 19 | 20 | Arguments: 21 | event {dict} -- Dictionary with details on previous processing step 22 | context {dict} -- Dictionary with details on Lambda context 23 | 24 | Returns: 25 | {dict} -- Dictionary with outcome of the process 26 | """ 27 | try: 28 | logger.info("Initializing Octagon client") 29 | component = context.function_name.split("-")[-2].title() 30 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build() 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | for records in event: 36 | for record in records: 37 | if "processed" not in record or not record["processed"]: 38 | partial_failure = True 39 | 40 | if not partial_failure: 41 | octagon_client.update_pipeline_execution( 42 | status="{} {} Processing".format(pipeline_stage, component), component=component 43 | ) 44 | octagon_client.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | octagon_client.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | 
logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/state-machine/stage-ecsfargate.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Simple ECS Fargate-based transform", 3 | "StartAt": "Try", 4 | "States": { 5 | "Try": { 6 | "Type": "Parallel", 7 | "Branches": [ 8 | { 9 | "StartAt": "Pass", 10 | "States": { 11 | "Pass": { 12 | "Type": "Pass", 13 | "Next": "Records", 14 | "Parameters": { 15 | "Items.$": "States.StringToJson($)" 16 | } 17 | }, 18 | "Records": { 19 | "Type": "Map", 20 | "ItemProcessor": { 21 | "ProcessorConfig": { 22 | "Mode": "DISTRIBUTED", 23 | "ExecutionType": "STANDARD" 24 | }, 25 | "StartAt": "Execute ECS Fargate Transformation", 26 | "States": { 27 | "Execute ECS Fargate Transformation": { 28 | "Type": "Task", 29 | "Resource": "arn:aws:states:::ecs:runTask.sync", 30 | "Parameters": { 31 | "LaunchType": "FARGATE", 32 | "Cluster": "$.Items[0].transform.ecsfargate_cluster", 33 | "TaskDefinition": "$.Items[0].transform.transform" 34 | }, 35 | "End": true 36 | } 37 | } 38 | }, 39 | "Next": "Post-update Catalog", 40 | "Label": "Records", 41 | "MaxConcurrency": 50, 42 | "ToleratedFailurePercentage": 100, 43 | "ItemBatcher": { 44 | "MaxItemsPerBatch": 1 45 | }, 46 | "InputPath": "$.Items" 47 | }, 48 | "Post-update Catalog": { 49 | "Type": "Task", 50 | "Resource": "arn:aws:states:::lambda:invoke", 51 | "ResultPath": null, 52 | "Parameters": { 53 | "Payload.$": "$", 54 | "FunctionName": "${lPostMetadata}:$LATEST" 55 | }, 56 | "Retry": [ 57 | { 58 | "ErrorEquals": [ 59 | "Lambda.ServiceException", 60 | "Lambda.AWSLambdaException", 61 | "Lambda.SdkClientException", 62 | "Lambda.TooManyRequestsException" 63 | ], 64 | "IntervalSeconds": 2, 65 | "MaxAttempts": 6, 66 | "BackoffRate": 2 67 | } 68 | ], 69 | "End": true 70 | } 71 | } 72 | } 73 | ], 74 | "End": true, 75 | "Catch": [ 76 | { 77 | "ErrorEquals": [ 78 | "States.ALL" 79 | ], 80 | "ResultPath": null, 81 | "Next": "Error" 82 | } 83 | ] 84 | }, 85 | "Error": { 86 | "Type": "Task", 87 | "Resource": "arn:aws:states:::lambda:invoke", 88 | "OutputPath": "$.Payload", 89 | "Parameters": { 90 | "Payload.$": "$", 91 | "FunctionName": "${lError}:$LATEST" 92 | }, 93 | "Retry": [ 94 | { 95 | "ErrorEquals": [ 96 | "Lambda.ServiceException", 97 | "Lambda.AWSLambdaException", 98 | "Lambda.SdkClientException", 99 | "Lambda.TooManyRequestsException" 100 | ], 101 | "IntervalSeconds": 2, 102 | "MaxAttempts": 6, 103 | "BackoffRate": 2 104 | } 105 | ], 106 | "Next": "Fail" 107 | }, 108 | "Fail": { 109 | "Type": "Fail" 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-emrserverless/README.md -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-emrserverless" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-emrserverless/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | 8 | logger = init_logger(__name__) 9 | team = os.environ["TEAM"] 10 | dataset = os.environ["DATASET"] 11 | pipeline = os.environ["PIPELINE"] 12 | pipeline_stage = os.environ["PIPELINE_STAGE"] 13 | org = os.environ["ORG"] 14 | domain = os.environ["DOMAIN"] 15 | env = os.environ["ENV"] 16 | 17 | 18 | def lambda_handler(event, context): 19 | try: 20 | if isinstance(event, str): 21 | event = json.loads(event) 22 | 23 | sqs_config = SQSConfiguration(team, pipeline, pipeline_stage) 24 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 25 | 26 | logger.info("Execution Failed. 
Sending original payload to DLQ") 27 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 28 | except Exception as e: 29 | logger.error("Fatal error", exc_info=True) 30 | raise e 31 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.octagon import peh 6 | 7 | logger = init_logger(__name__) 8 | team = os.environ["TEAM"] 9 | dataset = os.environ["DATASET"] 10 | pipeline = os.environ["PIPELINE"] 11 | pipeline_stage = os.environ["PIPELINE_STAGE"] 12 | org = os.environ["ORG"] 13 | domain = os.environ["DOMAIN"] 14 | env = os.environ["ENV"] 15 | 16 | 17 | def lambda_handler(event, context): 18 | """Updates the S3 objects metadata catalog 19 | 20 | Arguments: 21 | event {dict} -- Dictionary with details on previous processing step 22 | context {dict} -- Dictionary with details on Lambda context 23 | 24 | Returns: 25 | {dict} -- Dictionary with outcome of the process 26 | """ 27 | try: 28 | logger.info("Initializing Octagon client") 29 | component = context.function_name.split("-")[-2].title() 30 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build() 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | for records in event: 36 | for record in records: 37 | if "processed" not in record or not record["processed"]: 38 | partial_failure = True 39 | 40 | if not partial_failure: 41 | octagon_client.update_pipeline_execution( 42 | status="{} {} Processing".format(pipeline_stage, component), component=component 43 | ) 44 | octagon_client.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | octagon_client.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | 
logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-stage-glue/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-stage-glue/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-glue (sdlf-stageB) 2 | 3 | !!! note 4 | `sdlf-stage-glue` is defined in the [sdlf-stageB](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageB) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Glue](../_static/sdlf-stage-glue.png) 9 | 10 | Run a Glue job. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainB: 20 | Type: awslabs::sdlf::stageB::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 24 | pStageName: B 25 | pPipeline: main 26 | pTeamName: iot 27 | pTriggerType: schedule 28 | pEventPattern: !Sub >- 29 | { 30 | "source": ["aws.states"], 31 | "detail-type": ["Step Functions Execution Status Change"], 32 | "detail": { 33 | "status": ["SUCCEEDED"], 34 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 35 | } 36 | } 37 | pSchedule: "cron(*/5 * * * ? *)" 38 | pEnableTracing: false 39 | ``` 40 | 41 | ## Interface 42 | 43 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-stage-glue` publishes the following parameters: 44 | 45 | | SSM Parameter                                         | Description                                                        | Comment                                        | 46 | | ---------------------------------------------------- | ------------------------------------------------------------------ | ---------------------------------------------- | 47 | | `/SDLF/SM/{team}/{pipeline}{stage}SM`                 | Step Functions state machine for this stage                        |                                                | 48 | -------------------------------------------------------------------------------- /sdlf-stage-glue/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-glue" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 |     { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-glue/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | from datalake_library.sdlf import SQSConfiguration 7 | 8 | logger = init_logger(__name__) 9 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 |     try: 14 |         if isinstance(event, str): 15 |             event = json.loads(event) 16 | 17 |         sqs_config = SQSConfiguration(instance=deployment_instance) 18 |         sqs_interface = SQSInterface(sqs_config.stage_dlq) 19 | 20 |         logger.info("Execution Failed. 
Sending original payload to DLQ") 21 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 22 | except Exception as e: 23 | logger.error("Fatal error", exc_info=True) 24 | raise e 25 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.sdlf import PipelineExecutionHistoryAPI 5 | 6 | logger = init_logger(__name__) 7 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 8 | peh_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 9 | manifests_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | """Updates the S3 objects metadata catalog 14 | 15 | Arguments: 16 | event {dict} -- Dictionary with details on previous processing step 17 | context {dict} -- Dictionary with details on Lambda context 18 | 19 | Returns: 20 | {dict} -- Dictionary with outcome of the process 21 | """ 22 | try: 23 | logger.info("Initializing Octagon client") 24 | component = context.function_name.split("-")[-2].title() 25 | pipeline_execution = PipelineExecutionHistoryAPI( 26 | run_in_context="LAMBDA", 27 | region=os.getenv("AWS_REGION"), 28 | peh_table_instance=peh_table_instance, 29 | manifests_table_instance=manifests_table_instance, 30 | ) 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | pipeline_execution.retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | # for records in event: 36 | # for record in records: 37 | # if "processed" not in record or not record["processed"]: 38 | # partial_failure = True 39 | 40 | if not partial_failure: 41 | pipeline_execution.update_pipeline_execution( 42 | status=f"{deployment_instance} {component} Processing", component=component 43 | ) 44 | pipeline_execution.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | pipeline_execution.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{deployment_instance} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | sqs_config = SQSConfiguration(instance=deployment_instance) 14 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 15 | messages = dlq_interface.receive_messages(1) 16 | if not messages: 17 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 18 | return 19 | 20 | logger.info("Received {} messages".format(len(messages))) 21 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 22 | for message in messages: 23 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 24 | logger.info("Redrive message 
succeeded") 25 | except Exception as e: 26 | logger.error("Fatal error", exc_info=True) 27 | raise e 28 | return 29 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-lambda (sdlf-stageA) 2 | 3 | !!! note 4 | `sdlf-stage-lambda` is defined in the [sdlf-stageA](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageA) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Lambda](../_static/sdlf-stage-lambda.png) 9 | 10 | Run a Lambda function. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainA: 20 | Type: awslabs::sdlf::stageA::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: A 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: >- 28 | { 29 | "source": ["aws.s3"], 30 | "detail-type": ["Object Created"], 31 | "detail": { 32 | "bucket": { 33 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 34 | }, 35 | "object": { 36 | "key": [{ "prefix": "iot/legislators/" }] 37 | } 38 | } 39 | } 40 | pEnableTracing: false 41 | ``` 42 | 43 | ## Interface 44 | 45 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-stage-lambda` publishes the following parameters: 46 | 47 | | SSM Parameter | Description | Comment | 48 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 49 | | `/SDLF/Lambda/{team}/{pipeline}{stage}RoutingLambda` | Routing Lambda | | 50 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions | | 51 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-lambda" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | #sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-lambda/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | from datalake_library.sdlf import SQSConfiguration 7 | 8 | logger = init_logger(__name__) 9 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | if isinstance(event, str): 15 | event = json.loads(event) 16 | 17 | sqs_config = SQSConfiguration(instance=deployment_instance) 18 | sqs_interface = SQSInterface(sqs_config.stage_dlq) 19 | 20 | logger.info("Execution Failed. 
Sending original payload to DLQ") 21 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 22 | except Exception as e: 23 | logger.error("Fatal error", exc_info=True) 24 | raise e 25 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.sdlf import PipelineExecutionHistoryAPI 5 | 6 | logger = init_logger(__name__) 7 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 8 | peh_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 9 | manifests_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | """Updates the S3 objects metadata catalog 14 | 15 | Arguments: 16 | event {dict} -- Dictionary with details on previous processing step 17 | context {dict} -- Dictionary with details on Lambda context 18 | 19 | Returns: 20 | {dict} -- Dictionary with outcome of the process 21 | """ 22 | try: 23 | logger.info("Initializing Octagon client") 24 | component = context.function_name.split("-")[-2].title() 25 | pipeline_execution = PipelineExecutionHistoryAPI( 26 | run_in_context="LAMBDA", 27 | region=os.getenv("AWS_REGION"), 28 | peh_table_instance=peh_table_instance, 29 | manifests_table_instance=manifests_table_instance, 30 | ) 31 | peh_id = event[0]["run_output"][0]["transform"]["peh_id"] 32 | pipeline_execution.retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | # for records in event: 36 | # for record in records: 37 | # if "processed" not in record or not record["processed"]: 38 | # partial_failure = True 39 | 40 | if not partial_failure: 41 | pipeline_execution.update_pipeline_execution( 42 | status=f"{deployment_instance} {component} Processing", component=component 43 | ) 44 | pipeline_execution.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | pipeline_execution.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{deployment_instance} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/process-object/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import PurePath 4 | 5 | from datalake_library.commons import init_logger 6 | from datalake_library.interfaces.s3_interface import S3Interface 7 | from datalake_library.sdlf import ( 8 | KMSConfiguration, 9 | S3Configuration, 10 | ) 11 | 12 | logger = init_logger(__name__) 13 | s3_prefix = os.environ["S3_PREFIX"] 14 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 15 | storage_deployment_instance = os.environ["STORAGE_DEPLOYMENT_INSTANCE"] 16 | 17 | 18 | def transform_object(bucket, key): 19 | s3_interface = S3Interface() 20 | # IMPORTANT: Stage bucket where transformed data must be uploaded 21 | stage_bucket = S3Configuration(instance=storage_deployment_instance).stage_bucket 22 | # Download S3 object locally to /tmp directory 23 | # The s3_helper.download_object method 24 | # returns the local path where the file was saved 25 | local_path = 
s3_interface.download_object(bucket, key) 26 | 27 | # Apply business business logic: 28 | # Below example is opening a JSON file and 29 | # extracting fields, then saving the file 30 | # locally and re-uploading to Stage bucket 31 | def parse(json_data): 32 | l = [] # noqa: E741 33 | for d in json_data: 34 | o = d.copy() 35 | for k in d: 36 | if type(d[k]) in [dict, list]: 37 | o.pop(k) 38 | l.append(o) 39 | 40 | return l 41 | 42 | # Reading file locally 43 | with open(local_path, "r") as raw_file: 44 | data = raw_file.read() 45 | 46 | json_data = json.loads(data) 47 | 48 | # Saving file locally to /tmp after parsing 49 | output_path = f"{PurePath(local_path).with_suffix('')}_parsed.json" 50 | with open(output_path, "w", encoding="utf-8") as write_file: 51 | json.dump(parse(json_data), write_file, ensure_ascii=False, indent=4) 52 | 53 | # Uploading file to Stage bucket at appropriate path 54 | # IMPORTANT: Build the output s3_path without the s3://stage-bucket/ 55 | s3_path = f"{s3_prefix}/{deployment_instance}/{PurePath(output_path).name}" 56 | # IMPORTANT: Notice "stage_bucket" not "bucket" 57 | kms_key = KMSConfiguration(instance=storage_deployment_instance).data_kms_key 58 | s3_interface.upload_object(output_path, stage_bucket, s3_path, kms_key=kms_key) 59 | 60 | return s3_path 61 | 62 | 63 | def lambda_handler(event, context): 64 | """Calls custom transform developed by user 65 | 66 | Arguments: 67 | event {dict} -- Dictionary with details on previous processing step 68 | context {dict} -- Dictionary with details on Lambda context 69 | 70 | Returns: 71 | {dict} -- Dictionary with Processed Bucket and Key(s) 72 | """ 73 | try: 74 | # this default Lambda expects records to be S3 events 75 | for record in event: 76 | logger.info(f"Processing file: {record['object']['key']} in {record['bucket']['name']}") 77 | try: 78 | transform_object(record["bucket"]["name"], record["object"]["key"]) 79 | record["processed"] = True 80 | except json.decoder.JSONDecodeError as e: 81 | record["processed"] = False 82 | record["error"] = repr(e) 83 | 84 | except Exception as e: 85 | logger.error("Fatal error", exc_info=True) 86 | raise e 87 | 88 | return event 89 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.interfaces.sqs_interface import SQSInterface 5 | from datalake_library.sdlf import SQSConfiguration 6 | 7 | logger = init_logger(__name__) 8 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | sqs_config = SQSConfiguration(instance=deployment_instance) 14 | dlq_interface = SQSInterface(sqs_config.stage_dlq) 15 | messages = dlq_interface.receive_messages(1) 16 | if not messages: 17 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 18 | return 19 | 20 | logger.info("Received {} messages".format(len(messages))) 21 | queue_interface = SQSInterface(sqs_config.stage_queue) 22 | for message in messages: 23 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 24 | logger.info("Redrive message succeeded") 25 | except Exception as e: 26 | logger.error("Fatal error", exc_info=True) 27 | raise e 28 | return 29 | -------------------------------------------------------------------------------- /sdlf-stageA/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | if isinstance(event, str): 13 | event = json.loads(event) 14 | sqs_config = SQSConfiguration(event["team"], event["pipeline"], event["pipeline_stage"]) 15 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 16 | 17 | logger.info("Execution Failed. Sending original payload to DLQ") 18 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 19 | except Exception as e: 20 | logger.error("Fatal error", exc_info=True) 21 | raise e 22 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | from datalake_library import octagon 2 | from datalake_library.commons import init_logger 3 | from datalake_library.configuration.resource_configs import DynamoConfiguration, S3Configuration 4 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 5 | from datalake_library.interfaces.s3_interface import S3Interface 6 | from datalake_library.octagon import peh 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def lambda_handler(event, context): 12 | """Updates the S3 objects metadata catalog 13 | 14 | Arguments: 15 | event {dict} -- Dictionary with details on previous processing step 16 | context {dict} -- Dictionary with details on Lambda context 17 | 18 | Returns: 19 | {dict} -- Dictionary with outcome of the process 20 | """ 21 | try: 22 | logger.info("Fetching event data from previous step") 23 | processed_keys = event["body"]["processedKeys"] 24 | team = event["body"]["team"] 25 | pipeline = event["body"]["pipeline"] 26 | stage = event["body"]["pipeline_stage"] 27 | dataset = event["body"]["dataset"] 28 | peh_id = event["body"]["peh_id"] 29 | 30 | logger.info("Initializing Octagon client") 31 | component = context.function_name.split("-")[-2].title() 32 | octagon_client = ( 33 | octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["body"]["env"]).build() 34 | ) 35 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 36 | 37 | logger.info("Initializing DynamoDB config and Interface") 38 | dynamo_config = DynamoConfiguration() 39 | dynamo_interface = DynamoInterface(dynamo_config) 40 | 41 | logger.info("Storing metadata to DynamoDB") 42 | bucket = S3Configuration().stage_bucket 43 | for key in processed_keys: 44 | size, last_modified_date = S3Interface().get_size_and_last_modified(bucket, key) 45 | object_metadata = { 46 | "bucket": bucket, 47 | "key": key, 48 | "size": size, 49 | "last_modified_date": 
last_modified_date, 50 | "org": event["body"]["org"], 51 | "app": event["body"]["domain"], 52 | "env": event["body"]["env"], 53 | "team": team, 54 | "pipeline": pipeline, 55 | "dataset": dataset, 56 | "stage": "stage", 57 | "pipeline_stage": stage, 58 | "peh_id": peh_id, 59 | } 60 | 61 | dynamo_interface.update_object_metadata_catalog(object_metadata) 62 | 63 | octagon_client.update_pipeline_execution( 64 | status="{} {} Processing".format(stage, component), component=component 65 | ) 66 | octagon_client.end_pipeline_execution_success() 67 | except Exception as e: 68 | logger.error("Fatal error", exc_info=True) 69 | octagon_client.end_pipeline_execution_failed( 70 | component=component, issue_comment="{} {} Error: {}".format(stage, component, repr(e)) 71 | ) 72 | raise e 73 | return 200 74 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-preupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import DynamoConfiguration 6 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def get_lambda_transform_details(team, dataset, pipeline, stage): 12 | dynamo_config = DynamoConfiguration() 13 | dynamo_interface = DynamoInterface(dynamo_config) 14 | transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}") 15 | lambda_arn = os.getenv("STAGE_TRANSFORM_LAMBDA") 16 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 17 | if pipeline in transform_info.get("pipeline", {}): 18 | if stage in transform_info["pipeline"][pipeline]: 19 | logger.info(f"Details from DynamoDB: {transform_info['pipeline'][pipeline][stage]}") 20 | lambda_arn = transform_info["pipeline"][pipeline][stage].get("lambda_arn", lambda_arn) 21 | ####################################################### 22 | # We assume a Lambda function has already been created based on 23 | # customer needs. 24 | ####################################################### 25 | 26 | return {"lambda_arn": lambda_arn} 27 | 28 | 29 | def lambda_handler(event, context): 30 | """Updates the objects metadata catalog 31 | 32 | Arguments: 33 | event {dict} -- Dictionary with details on S3 event 34 | context {dict} -- Dictionary with details on Lambda context 35 | 36 | Returns: 37 | {dict} -- Dictionary with Processed Bucket and Key 38 | """ 39 | try: 40 | logger.info("Fetching event data from previous step") 41 | team = event["team"] 42 | pipeline = event["pipeline"] 43 | stage = event["pipeline_stage"] 44 | dataset = event["dataset"] 45 | 46 | logger.info("Initializing Octagon client") 47 | component = context.function_name.split("-")[-2].title() 48 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["env"]).build() 49 | event["peh_id"] = octagon_client.start_pipeline_execution( 50 | pipeline_name="{}-{}-{}".format(team, pipeline, stage), 51 | dataset_name="{}-{}".format(team, dataset), 52 | comment=event, 53 | ) 54 | # Add business metadata (e.g. 
event['project'] = 'xyz') 55 | 56 | logger.info("Initializing DynamoDB config and Interface") 57 | dynamo_config = DynamoConfiguration() 58 | dynamo_interface = DynamoInterface(dynamo_config) 59 | 60 | logger.info("Storing metadata to DynamoDB") 61 | dynamo_interface.update_object_metadata_catalog(event) 62 | 63 | logger.info("Passing arguments to the next function of the state machine") 64 | octagon_client.update_pipeline_execution( 65 | status="{} {} Processing".format(stage, component), component=component 66 | ) 67 | 68 | event["lambda"] = get_lambda_transform_details(team, dataset, pipeline, stage) # custom user code called 69 | except Exception as e: 70 | logger.error("Fatal error", exc_info=True) 71 | octagon_client.end_pipeline_execution_failed( 72 | component=component, 73 | issue_comment="{} {} Error: {}".format(stage, component, repr(e)), 74 | ) 75 | raise e 76 | return {"statusCode": 200, "body": event} 77 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-routing/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import StateMachineConfiguration 6 | from datalake_library.interfaces.states_interface import StatesInterface 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | logger.info("Received {} messages".format(len(event["Records"]))) 14 | for record in event["Records"]: 15 | logger.info("Starting State Machine Execution") 16 | event_body = json.loads(record["body"]) 17 | object_key = event_body["object"]["key"].split("/") 18 | team = object_key[0] 19 | dataset = object_key[1] 20 | pipeline = os.environ["PIPELINE"] 21 | pipeline_stage = os.environ["PIPELINE_STAGE"] 22 | org = os.environ["ORG"] 23 | domain = os.environ["DOMAIN"] 24 | env = os.environ["ENV"] 25 | 26 | event_with_pipeline_details = { 27 | **event_body["object"], 28 | "bucket": event_body["bucket"]["name"], 29 | "team": team, 30 | "dataset": dataset, 31 | "pipeline": pipeline, 32 | "pipeline_stage": pipeline_stage, 33 | "org": org, 34 | "domain": domain, 35 | 
"env": env, 36 | } 37 | 38 | state_config = StateMachineConfiguration(team, pipeline, pipeline_stage) 39 | StatesInterface().run_state_machine( 40 | state_config.get_stage_state_machine_arn, json.dumps(event_with_pipeline_details) 41 | ) 42 | except Exception as e: 43 | logger.error("Fatal error", exc_info=True) 44 | raise e 45 | -------------------------------------------------------------------------------- /sdlf-stageA/state-machine/stage-a.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Simple Lambda-based transform", 3 | "StartAt": "Try", 4 | "States": { 5 | "Try": { 6 | "Type": "Parallel", 7 | "Branches": [ 8 | { 9 | "StartAt": "Pre-update Catalog", 10 | "States": { 11 | "Pre-update Catalog": { 12 | "Type": "Task", 13 | "Resource": "arn:aws:states:::lambda:invoke", 14 | "OutputPath": "$.Payload", 15 | "Parameters": { 16 | "Payload.$": "$", 17 | "FunctionName": "${lStep1}:$LATEST" 18 | }, 19 | "Retry": [ 20 | { 21 | "ErrorEquals": [ 22 | "Lambda.ServiceException", 23 | "Lambda.AWSLambdaException", 24 | "Lambda.SdkClientException", 25 | "Lambda.TooManyRequestsException" 26 | ], 27 | "IntervalSeconds": 2, 28 | "MaxAttempts": 6, 29 | "BackoffRate": 2 30 | } 31 | ], 32 | "Next": "Execute Light Transformation" 33 | }, 34 | "Execute Light Transformation": { 35 | "Type": "Task", 36 | "Resource": "arn:aws:states:::lambda:invoke", 37 | "OutputPath": "$.Payload", 38 | "Parameters": { 39 | "Payload.$": "$", 40 | "FunctionName.$": "$.body.lambda.lambda_arn" 41 | }, 42 | "Retry": [ 43 | { 44 | "ErrorEquals": [ 45 | "Lambda.ServiceException", 46 | "Lambda.AWSLambdaException", 47 | "Lambda.SdkClientException", 48 | "Lambda.TooManyRequestsException" 49 | ], 50 | "IntervalSeconds": 2, 51 | "MaxAttempts": 6, 52 | "BackoffRate": 2 53 | } 54 | ], 55 | "Next": "Post-update Catalog" 56 | }, 57 | "Post-update Catalog": { 58 | "Type": "Task", 59 | "Resource": "arn:aws:states:::lambda:invoke", 60 | "ResultPath": null, 61 | "Parameters": { 62 | "Payload.$": "$", 63 | "FunctionName": "${lStep3}:$LATEST" 64 | }, 65 | "Retry": [ 66 | { 67 | "ErrorEquals": [ 68 | "Lambda.ServiceException", 69 | "Lambda.AWSLambdaException", 70 | "Lambda.SdkClientException", 71 | "Lambda.TooManyRequestsException" 72 | ], 73 | "IntervalSeconds": 2, 74 | "MaxAttempts": 6, 75 | "BackoffRate": 2 76 | } 77 | ], 78 | "End": true 79 | } 80 | } 81 | } 82 | ], 83 | "End": true, 84 | "Catch": [ 85 | { 86 | "ErrorEquals": [ 87 | "States.ALL" 88 | ], 89 | "ResultPath": null, 90 | "Next": "Error" 91 | } 92 | ] 93 | }, 94 | "Error": { 95 | "Type": "Task", 96 | "Resource": "arn:aws:states:::lambda:invoke", 97 | "OutputPath": "$.Payload", 98 | "Parameters": { 99 | "Payload.$": "$", 100 | "FunctionName": "${lError}:$LATEST" 101 | }, 102 | "Retry": [ 103 | { 104 | "ErrorEquals": [ 105 | "Lambda.ServiceException", 106 | "Lambda.AWSLambdaException", 107 | "Lambda.SdkClientException", 108 | "Lambda.TooManyRequestsException" 109 | ], 110 | "IntervalSeconds": 2, 111 | "MaxAttempts": 6, 112 | "BackoffRate": 2 113 | } 114 | ], 115 | "Next": "Fail" 116 | }, 117 | "Fail": { 118 | "Type": "Fail" 119 | } 120 | } 121 | } -------------------------------------------------------------------------------- /sdlf-stageB/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 
15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | if isinstance(event, str): 13 | event = json.loads(event) 14 | sqs_config = SQSConfiguration(event["body"]["team"], event["body"]["pipeline"], event["body"]["pipeline_stage"]) 15 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 16 | 17 | logger.info("Execution Failed. Sending original payload to DLQ") 18 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 19 | except Exception as e: 20 | logger.error("Fatal error", exc_info=True) 21 | raise e 22 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-fetch-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | from datalake_library import octagon 2 | from datalake_library.commons import init_logger 3 | from datalake_library.configuration.resource_configs import DynamoConfiguration 4 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 5 | 6 | logger = init_logger(__name__) 7 | 8 | 9 | def get_glue_transform_details(bucket, team, dataset, pipeline, stage): 10 | dynamo_config = DynamoConfiguration() 11 | dynamo_interface = DynamoInterface(dynamo_config) 12 | transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}") 13 | # we assume a Glue Job has already been created based on customer needs 14 | job_name = f"sdlf-{team}-{dataset}-glue-job" # Name of the Glue Job 15 | glue_capacity = {"WorkerType": "G.1X", "NumberOfWorkers": 10} 16 | wait_time = 60 17 | glue_arguments = { 18 | # Specify any arguments needed based on bucket and keys (e.g. 
input/output S3 locations) 19 | "--SOURCE_LOCATION": f"s3://{bucket}/pre-stage/{team}/{dataset}", 20 | "--OUTPUT_LOCATION": f"s3://{bucket}/post-stage/{team}/{dataset}", 21 | "--job-bookmark-option": "job-bookmark-enable", 22 | } 23 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 24 | if pipeline in transform_info.get("pipeline", {}): 25 | if stage in transform_info["pipeline"][pipeline]: 26 | logger.info(f"Details from DynamoDB: {transform_info['pipeline'][pipeline][stage]}") 27 | job_name = transform_info["pipeline"][pipeline][stage].get("job_name", job_name) 28 | glue_capacity = transform_info["pipeline"][pipeline][stage].get("glue_capacity", glue_capacity) 29 | wait_time = transform_info["pipeline"][pipeline][stage].get("wait_time", wait_time) 30 | glue_arguments |= transform_info["pipeline"][pipeline][stage].get("glue_extra_arguments", {}) 31 | 32 | return {"job_name": job_name, "wait_time": wait_time, "arguments": glue_arguments, **glue_capacity} 33 | 34 | 35 | def lambda_handler(event, context): 36 | """Calls custom transform developed by user 37 | 38 | Arguments: 39 | event {dict} -- Dictionary with details on previous processing step 40 | context {dict} -- Dictionary with details on Lambda context 41 | 42 | Returns: 43 | {dict} -- Dictionary with Processed Bucket and Key(s) 44 | """ 45 | try: 46 | logger.info("Fetching event data from previous step") 47 | bucket = event["body"]["bucket"] 48 | team = event["body"]["team"] 49 | pipeline = event["body"]["pipeline"] 50 | stage = event["body"]["pipeline_stage"] 51 | dataset = event["body"]["dataset"] 52 | 53 | logger.info("Initializing Octagon client") 54 | component = context.function_name.split("-")[-2].title() 55 | octagon_client = ( 56 | octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["body"]["env"]).build() 57 | ) 58 | peh_id = octagon_client.start_pipeline_execution( 59 | pipeline_name="{}-{}-{}".format(team, pipeline, stage), 60 | dataset_name="{}-{}".format(team, dataset), 61 | comment=event, 62 | ) 63 | 64 | # Call custom transform created by user and process the file 65 | logger.info("Calling user custom processing code") 66 | event["body"]["glue"] = get_glue_transform_details( 67 | bucket, team, dataset, pipeline, stage 68 | ) # custom user code called 69 | event["body"]["glue"]["crawler_name"] = "-".join(["sdlf", team, dataset, "post-stage-crawler"]) 70 | event["body"]["peh_id"] = peh_id 71 | octagon_client.update_pipeline_execution( 72 | status="{} {} Processing".format(stage, component), component=component 73 | ) 74 | except Exception as e: 75 | logger.error("Fatal error", exc_info=True) 76 | octagon_client.end_pipeline_execution_failed( 77 | component=component, 78 | issue_comment="{} {} Error: {}".format(stage, component, repr(e)), 79 | ) 80 | raise e 81 | return event 82 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | from datalake_library import octagon 2 | from datalake_library.commons import init_logger 3 | from datalake_library.configuration.resource_configs import DynamoConfiguration 4 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 5 | from datalake_library.interfaces.s3_interface import S3Interface 6 | from datalake_library.octagon import peh 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def lambda_handler(event, context): 12 | """Updates the S3 
objects metadata catalog 13 | 14 | Arguments: 15 | event {dict} -- Dictionary with details on Bucket and Keys 16 | context {dict} -- Dictionary with details on Lambda context 17 | 18 | Returns: 19 | {dict} -- Dictionary with response 20 | """ 21 | try: 22 | logger.info("Fetching event data from previous step") 23 | bucket = event["body"]["bucket"] 24 | team = event["body"]["team"] 25 | pipeline = event["body"]["pipeline"] 26 | stage = event["body"]["pipeline_stage"] 27 | dataset = event["body"]["dataset"] 28 | peh_id = event["body"]["peh_id"] 29 | processed_keys_path = f"post-stage/{team}/{dataset}" 30 | processed_keys = S3Interface().list_objects(bucket, processed_keys_path) 31 | 32 | logger.info("Initializing Octagon client") 33 | component = context.function_name.split("-")[-2].title() 34 | octagon_client = ( 35 | octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["body"]["env"]).build() 36 | ) 37 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 38 | 39 | logger.info("Initializing DynamoDB config and Interface") 40 | dynamo_config = DynamoConfiguration() 41 | dynamo_interface = DynamoInterface(dynamo_config) 42 | 43 | logger.info("Storing metadata to DynamoDB") 44 | all_objects_metadata = [] 45 | for key in processed_keys: 46 | size, last_modified_date = S3Interface().get_size_and_last_modified(bucket, key) 47 | object_metadata = { 48 | "bucket": bucket, 49 | "key": key, 50 | "size": size, 51 | "last_modified_date": last_modified_date, 52 | "org": event["body"]["org"], 53 | "app": event["body"]["domain"], 54 | "env": event["body"]["env"], 55 | "team": team, 56 | "pipeline": pipeline, 57 | "dataset": dataset, 58 | "stage": "stage", 59 | "pipeline_stage": stage, 60 | "peh_id": peh_id, 61 | } 62 | all_objects_metadata.append(object_metadata) 63 | dynamo_interface.batch_update_object_metadata_catalog(all_objects_metadata) 64 | 65 | octagon_client.update_pipeline_execution( 66 | status="{} {} Processing".format(stage, component), component=component 67 | ) 68 | octagon_client.end_pipeline_execution_success() 69 | except Exception as e: 70 | logger.error("Fatal error", exc_info=True) 71 | octagon_client.end_pipeline_execution_failed( 72 | component=component, issue_comment="{} {} Error: {}".format(stage, component, repr(e)) 73 | ) 74 | raise e 75 | return 200 76 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration, StateMachineConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | from datalake_library.interfaces.states_interface import StatesInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | team = os.environ["TEAM"] 15 | pipeline = os.environ["PIPELINE"] 16 | stage = os.environ["STAGE"] 17 | state_config = StateMachineConfiguration(team, pipeline, stage) 18 | sqs_config = SQSConfiguration(team, pipeline, stage) 19 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 20 | 21 | messages = dlq_interface.receive_messages(1) 22 | if not messages: 23 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 24 | return 25 | 26 | logger.info("Received {} 
messages".format(len(messages))) 27 | for message in messages: 28 | logger.info("Starting State Machine Execution") 29 | if isinstance(message["Body"], str): 30 | response = json.loads(message["Body"]) 31 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 32 | logger.info("Redrive message succeeded") 33 | except Exception as e: 34 | logger.error("Fatal error", exc_info=True) 35 | raise e 36 | return 37 | -------------------------------------------------------------------------------- /sdlf-team/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-team/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-team 2 | 3 | !!! note 4 | `sdlf-team` is defined in the [sdlf-team](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-team) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Team](../_static/sdlf-team.png){: style="width:80%"} 9 | 10 | A team is a group of individuals that wish to onboard into the data lake. It can be a pizza team of developers or an entire Business Unit such as the marketing or finance department. A team is responsible for their data pipelines, datasets and repositories which are unique to the team and completely segregated from others. Teams are also isolated from both an operational and security standpoint through least-privilege IAM policies. 11 | 12 | As such `sdlf-team` is mostly about permissions. 13 | 14 | The two `Pipelines` and `Datasets` Lambda functions (and related resources) are used to populate the DynamoDB tables `octagon-Pipelines-{environment}` and `octagon-Datasets-{environment}` from `sdlf-foundations`. 15 | 16 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 17 | 18 | !!! warning 19 | The data lake admin team should be the only one with write access to the `sdlf-team` code base, as it is used to restrict permissions given to team members. 20 | 21 | ## Usage 22 | 23 | ### CloudFormation with [sdlf-cicd](cicd.md) 24 | 25 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 26 | 27 | ``` 28 | rExample: 29 | Type: awslabs::sdlf::team::MODULE 30 | Properties: 31 | pPipelineReference: !Ref pPipelineReference 32 | pTeamName: industry 33 | pEnvironment: dev 34 | pSNSNotificationsEmail: nobody@amazon.com 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-team` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Athena/{team}/WorkgroupName` | Team Athena workgroup name | | 44 | | `/SDLF/EventBridge/{team}/EventBusName` | Name of the team dedicated event bus | | 45 | | `/SDLF/EventBridge/{team}/ScheduleGroupName` | Name of the team dedicated schedule group | | 46 | | `/SDLF/Glue/${pTeamName}/SecurityConfigurationId` | Glue security configuration name | | 47 | | `/SDLF/IAM/${pTeamName}/CrawlerRoleArn` | IAM Role ARN for Glue crawlers | | 48 | | `/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary` | ARN of the permissions boundary IAM Managed policy for the team | | 49 | | `/SDLF/KMS/${pTeamName}/DataKeyId` | ARN of the team KMS data key | | 50 | | `/SDLF/KMS/${pTeamName}/InfraKeyId` | ARN of the team KMS infrastructure key | | 51 | | `/SDLF/SNS/${pTeamName}/Notifications` | ARN of the team-specific SNS Topic | | 52 | -------------------------------------------------------------------------------- /sdlf-team/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.team" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /sdlf-team/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-team/src/__init__.py -------------------------------------------------------------------------------- /sdlf-team/src/lambda/datasets-dynamodb/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | import boto3 6 | from boto3.dynamodb.types import TypeSerializer 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | dynamodb = boto3.client("dynamodb") 12 | ssm_endpoint_url = "https://ssm." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 13 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 14 | 15 | 16 | def delete_dynamodb_dataset_entry(table_name, team_name, dataset_name): 17 | response = dynamodb.delete_item( 18 | TableName=table_name, 19 | Key={"name": {"S": f"{team_name}-{dataset_name}"}}, 20 | ) 21 | return response 22 | 23 | 24 | def create_dynamodb_dataset_entry(table_name, team_name, dataset_name, pipeline_details): 25 | pipeline_details_dynamodb_json = TypeSerializer().serialize(pipeline_details) 26 | logger.info("PIPELINE DETAILS DYNAMODB JSON: %s", pipeline_details_dynamodb_json) 27 | response = dynamodb.update_item( 28 | TableName=table_name, 29 | Key={"name": {"S": f"{team_name}-{dataset_name}"}}, 30 | ExpressionAttributeNames={ 31 | "#P": "pipeline", 32 | "#V": "version", 33 | }, 34 | ExpressionAttributeValues={ 35 | ":p": pipeline_details_dynamodb_json, 36 | ":v": {"N": "1"}, 37 | }, 38 | UpdateExpression="SET #P = :p, #V = :v", 39 | ReturnValues="UPDATED_NEW", 40 | ) 41 | return response 42 | 43 | 44 | def lambda_handler(event, context): 45 | try: 46 | environment = os.getenv("ENVIRONMENT") 47 | team_name = os.getenv("TEAM_NAME") 48 | table = f"octagon-Datasets-{environment}" 49 | 50 | paginator = ssm.get_paginator("get_parameters_by_path") 51 | datasets_pages = paginator.paginate(Path=f"/SDLF/Datasets/{team_name}") 52 | 53 | for datasets_page in datasets_pages: 54 | for dataset in datasets_page["Parameters"]: 55 | dataset_name = dataset["Name"].split("/")[-1] 56 | logger.info("DATASET SSM CONTENT: %s", dataset["Value"]) 57 | dataset_pipeline_details = json.loads(dataset["Value"]) 58 | create_dynamodb_dataset_entry(table, team_name, dataset_name, dataset_pipeline_details) 59 | logger.info(f"{team_name}-{dataset_name} DynamoDB Dataset entry created") 60 | 61 | logger.info("INFO: Entries for datasets that no longer exist are not removed from DynamoDB") 62 | except Exception as e: 63 | message = "Function exception: " + str(e) 64 | logger.error(message, exc_info=True) 65 | raise 66 | 67 | return "Success" 68 | -------------------------------------------------------------------------------- /sdlf-team/src/lambda/pipelines-dynamodb/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | 9 | dynamodb = boto3.client("dynamodb") 10 | ssm_endpoint_url = "https://ssm." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 11 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 12 | 13 | 14 | def delete_dynamodb_pipeline_entry(table_name, team_name, pipeline_name, stage_name): 15 | response = dynamodb.delete_item( 16 | TableName=table_name, 17 | Key={"name": {"S": f"{team_name}-{pipeline_name}-{stage_name}"}}, 18 | ) 19 | return response 20 | 21 | 22 | def create_dynamodb_pipeline_entry(table_name, team_name, pipeline_name, stage_name): 23 | response = dynamodb.update_item( 24 | TableName=table_name, 25 | Key={"name": {"S": f"{team_name}-{pipeline_name}-{stage_name}"}}, 26 | ExpressionAttributeNames={ 27 | "#T": "type", 28 | "#S": "status", 29 | "#P": "pipeline", 30 | "#V": "version", 31 | }, 32 | ExpressionAttributeValues={ 33 | ":t": { 34 | "S": "TRANSFORMATION", 35 | }, 36 | ":s": {"S": "ACTIVE"}, 37 | ":p": {"M": {"max_items_process": {"N": "100"}, "min_items_process": {"N": "1"}}}, 38 | ":v": {"N": "1"}, 39 | }, 40 | UpdateExpression="SET #T = :t, #S = :s, #P = :p, #V = :v", 41 | ReturnValues="UPDATED_NEW", 42 | ) 43 | return response 44 | 45 | 46 | def lambda_handler(event, context): 47 | try: 48 | environment = os.getenv("ENVIRONMENT") 49 | team_name = os.getenv("TEAM_NAME") 50 | table = f"octagon-Pipelines-{environment}" 51 | 52 | paginator = ssm.get_paginator("get_parameters_by_path") 53 | stages_pages = paginator.paginate( 54 | Path=f"/SDLF/Pipelines/{team_name}", 55 | Recursive=True, 56 | ) 57 | for stages_page in stages_pages: 58 | for stage in stages_page["Parameters"]: 59 | pipeline_name = stage["Name"].split("/")[-2] 60 | stage_name = stage["Name"].split("/")[-1] 61 | create_dynamodb_pipeline_entry(table, team_name, pipeline_name, stage_name) 62 | logger.info(f"{team_name}-{pipeline_name}-{stage_name} DynamoDB Pipeline entry created") 63 | 64 | logger.info("INFO: Entries for stages that no longer exist are *not* removed from DynamoDB") 65 | except Exception as e: 66 | message = "Function exception: " + str(e) 67 | logger.error(message, exc_info=True) 68 | raise 69 | 70 | return "Success" 71 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/dataset-legislators.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Example datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/dataset/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pS3Prefix: legislators 17 | pDeploymentInstance: dev 18 | pStorageDeploymentInstance: dev 19 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/foundations-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: SDLF Foundations in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rAnycompany: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/foundations/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pChildAccountId: !Ref AWS::AccountId 17 | pOrg: anycompany 18 | pDomain: datalake 19 | pDeploymentInstance: 
dev 20 | pCicdRole: Admin 21 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/stagelambda/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pDeploymentInstance: mainA 17 | pStorageDeploymentInstance: dev 18 | pDatasetDeploymentInstance: dev 19 | pTriggerType: event 20 | pEventPattern: >- 21 | { 22 | "source": ["aws.s3"], 23 | "detail-type": ["Object Created"], 24 | "detail": { 25 | "bucket": { 26 | "name": ["{{resolve:ssm:/sdlf/storage/rRawBucket/dev}}"] 27 | }, 28 | "object": { 29 | "key": [{ "prefix": "legislators/" }] 30 | } 31 | } 32 | } 33 | pEnableTracing: false 34 | 35 | rMainB: 36 | Type: AWS::CloudFormation::Stack 37 | Properties: 38 | TemplateURL: "{{resolve:ssm:/sdlf/stageglue/main}}" 39 | Parameters: 40 | pPipelineReference: !Ref pPipelineReference 41 | pDeploymentInstance: mainB 42 | pStorageDeploymentInstance: dev 43 | pDatasetDeploymentInstance: dev 44 | pGlueJobName: sdlf-mainB-glue-job 45 | pGlueNumberOfWorkers: 10 46 | pGlueWorkerType: G.1X 47 | pTriggerType: schedule 48 | pEventPattern: >- 49 | { 50 | "source": ["aws.s3"], 51 | "detail-type": ["Object Created"], 52 | "detail": { 53 | "bucket": { 54 | "name": ["{{resolve:ssm:/sdlf/storage/rStageBucket/dev}}"] 55 | }, 56 | "object": { 57 | "key": [{ "prefix": "legislators/mainA/" }] 58 | } 59 | } 60 | } 61 | pSchedule: "cron(*/5 * * * ? 
*)" 62 | pEnableTracing: false 63 | pGlueArguments: >- 64 | { 65 | "--job-bookmark-option": "job-bookmark-enable", 66 | "--enable-metrics": "", 67 | "--enable-auto-scaling": "true", 68 | "--SOURCE_LOCATION": !Sub "s3://{{resolve:ssm:/sdlf/storage/rStageBucket/dev}}/legislators/mainA", 69 | "--OUTPUT_LOCATION": !Sub "s3://{{resolve:ssm:/sdlf/storage/rAnalyticsBucket/dev}}/legislators/mainB" 70 | } 71 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/team-datalake-engineering-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering SDLF Team in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rEngineering: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/team/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pTeamName: engineering 17 | pStorageDeploymentInstance: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/datasets.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Engineering team datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: awslabs::sdlf::dataset::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: engineering 15 | pDatasetName: legislators 16 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering team Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: main 16 | pTeamName: engineering 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "engineering/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | 33 | rMainB: 34 | Type: awslabs::sdlf::stageB::MODULE 35 | Properties: 36 | pPipelineReference: !Ref pPipelineReference 37 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 38 | pStageName: B 39 | pPipeline: main 40 | pTeamName: engineering 41 | pTriggerType: schedule 42 | pEventPattern: !Sub >- 43 | { 44 | "source": ["aws.states"], 45 | "detail-type": ["Step Functions Execution Status Change"], 46 | "detail": { 47 | "status": ["SUCCEEDED"], 48 | "stateMachineArn": 
["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-engineering-main-sm-A"] 49 | } 50 | } 51 | pSchedule: "cron(*/5 * * * ? *)" 52 | pEnableTracing: false 53 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/pipelines.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering team pipelines 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMain: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./pipeline-main.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | Tags: 19 | - Key: sdlf:pipeline 20 | Value: main 21 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/datadomain-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: datalake data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rForecourt: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-datalake-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rEngineering: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rForecourt 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-datalake-engineering-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: engineering 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/foundations-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rForecourt: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: !Ref "AWS::AccountId" 15 | pOrg: forecourt 16 | pDomain: datalake 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/team-datalake-engineering-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 
| Description: Engineering SDLF Team in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rEngineering: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: engineering 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/datasets.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: iot team datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: awslabs::sdlf::dataset::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: iot 15 | pDatasetName: legislators 16 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: main 16 | pTeamName: iot 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "iot/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | 33 | rMainB: 34 | Type: awslabs::sdlf::stageB::MODULE 35 | Properties: 36 | pPipelineReference: !Ref pPipelineReference 37 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 38 | pStageName: B 39 | pPipeline: main 40 | pTeamName: iot 41 | pTriggerType: schedule 42 | pEventPattern: !Sub >- 43 | { 44 | "source": ["aws.states"], 45 | "detail-type": ["Step Functions Execution Status Change"], 46 | "detail": { 47 | "status": ["SUCCEEDED"], 48 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 49 | } 50 | } 51 | pSchedule: "cron(*/5 * * * ? 
*)" 52 | pEnableTracing: false 53 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipeline-singlestage.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Single stage pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rSingleA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: singlestage 16 | pTeamName: iot 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "iot/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipelines.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: iot team pipelines 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMain: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./pipeline-main.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | Tags: 19 | - Key: sdlf:pipeline 20 | Value: main 21 | rSingleStage: 22 | Type: AWS::CloudFormation::Stack 23 | DeletionPolicy: Delete 24 | UpdateReplacePolicy: Delete 25 | Properties: 26 | TemplateURL: ./pipeline-singlestage.yaml 27 | Parameters: 28 | pPipelineReference: !Ref pPipelineReference 29 | Tags: 30 | - Key: sdlf:pipeline 31 | Value: singlestage 32 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf", 4 | "sdlf:team" : "iot" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/datadomain-marketing-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: marketing data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMarketing: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-marketing-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rIndustry: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rMarketing 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-marketing-industry-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: industry 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/datadomain-proserve-dev.yaml: 
-------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: proserve data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rProserve: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-proserve-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rIot: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rProserve 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-proserve-iot-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: iot 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/foundations-marketing-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in marketing domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMarketing: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: 222222222222 15 | pOrg: forecourt 16 | pDomain: marketing 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/foundations-proserve-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in proserve domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rProserve: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: 111111111111 15 | pOrg: forecourt 16 | pDomain: proserve 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/team-marketing-industry-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Industry SDLF Team in marketing domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rIndustry: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: industry 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/team-proserve-iot-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: IOT SDLF Team in proserve domain, dev 
environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rIot: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: iot 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/clean-up.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | STORAGE_DEPLOYMENT_INSTANCE=dev 4 | DATASET_DEPLOYMENT_INSTANCE=dev 5 | TEAM_NAME=engineering 6 | #PRINCIPAL= 7 | 8 | # echo "Granting Drop on Glue DBs" 9 | # SDLF_ORG=$(aws ssm get-parameter --name "/sdlf/storage/rOrganization/$STORAGE_DEPLOYMENT_INSTANCE" --query "Parameter.Value" --output text) 10 | # for DB in $(aws glue get-databases | jq -r '.[][].Name') 11 | # do 12 | # case "$DB" in 13 | # $SDLF_ORG*) aws lakeformation grant-permissions --principal DataLakePrincipalIdentifier="$PRINCIPAL" --permissions DROP --resource $(echo \'{\"Database\":{\"Name\":\"$DB\"}}\' | tr -d \');; 14 | # *) echo "Skipping non-SDLF database" ;; 15 | # esac 16 | # done 17 | 18 | echo "Fetch KMS keys ARN - SSM parameters won't be available once stacks have been deleted" 19 | declare -a KEYS=("/sdlf/storage/rKMSKey/$STORAGE_DEPLOYMENT_INSTANCE" 20 | "/sdlf/dataset/rKMSInfraKey/$DATASET_DEPLOYMENT_INSTANCE" 21 | "/sdlf/dataset/rKMSDataKey/$DATASET_DEPLOYMENT_INSTANCE" 22 | "/SDLF/KMS/$TEAM_NAME/InfraKeyId" 23 | ) 24 | KEYS_ARN=() 25 | for KEY in "${KEYS[@]}" 26 | do 27 | echo "Finding $KEY ARN" 28 | if KEY_ARN=$(aws ssm get-parameter --name "$KEY" --query "Parameter.Value" --output text); then 29 | KEYS_ARN+=("$KEY_ARN") 30 | else 31 | echo "Key does not exist, skipping" 32 | fi 33 | done 34 | 35 | echo "Emptying SDLF buckets..." 36 | declare -a BUCKETS=("/sdlf/storage/rArtifactsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 37 | "/sdlf/storage/rRawBucket/$STORAGE_DEPLOYMENT_INSTANCE" 38 | "/sdlf/storage/rStageBucket/$STORAGE_DEPLOYMENT_INSTANCE" 39 | "/sdlf/storage/rAnalyticsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 40 | "/sdlf/storage/rAthenaBucket/$STORAGE_DEPLOYMENT_INSTANCE" 41 | "/sdlf/storage/rS3AccessLogsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 42 | ) 43 | for BUCKET in "${BUCKETS[@]}" 44 | do 45 | echo "Finding $BUCKET bucket name" 46 | if S3_BUCKET=$(aws ssm get-parameter --name "$BUCKET" --query "Parameter.Value" --output text); then 47 | echo "Emptying $S3_BUCKET" 48 | aws s3 rm "s3://$S3_BUCKET" --recursive 49 | if [ "$(aws s3api get-bucket-versioning --bucket "$S3_BUCKET" --output text)" == "Enabled" ]; then 50 | objects_versions=$(aws s3api list-object-versions --bucket "$S3_BUCKET" --output=json --query='{Objects: Versions[].{Key:Key,VersionId:VersionId}}') 51 | if [ "$(jq -r ".Objects" <<< "$objects_versions")" != "null" ]; then 52 | aws s3api delete-objects --bucket "$S3_BUCKET" --delete "$objects_versions" 53 | fi 54 | fi 55 | else 56 | echo "Bucket does not exist, skipping" 57 | fi 58 | done 59 | 60 | echo "Deleting SDLF stacks..." 
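# Only root-level stacks are targeted (nested stacks are deleted along with their parents),
# sorted newest-first so dependent stacks are removed before the foundations they rely on.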
61 | STACKS=$(aws cloudformation list-stacks --query "StackSummaries[?starts_with(StackName,'sdlf-') && StackStatus!='DELETE_COMPLETE']" | jq -r "sort_by(.CreationTime) | reverse[] | select(.ParentId == null) | .StackName") 62 | for STACK in $STACKS 63 | do 64 | echo "Deleting stack $STACK" 65 | aws cloudformation delete-stack --stack-name "$STACK" 66 | done 67 | for STACK in $STACKS 68 | do 69 | echo "Waiting for $STACK stack delete to complete ..." && aws cloudformation wait stack-delete-complete --stack-name "$STACK" && echo "Finished delete successfully!" 70 | done 71 | 72 | echo "Deleting KMS keys" 73 | for KEY_ARN in "${KEYS_ARN[@]}" 74 | do 75 | echo "Deleting $KEY_ARN" 76 | aws kms schedule-key-deletion --key-id "$KEY_ARN" --pending-window-in-days 7 2>&1 77 | done 78 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pflag=false 3 | 4 | DIRNAME=$(dirname "$0") 5 | 6 | usage () { echo " 7 | -h -- Opens up this help message 8 | -p -- Name of the AWS profile to use 9 | "; } 10 | options=':p:h' 11 | while getopts "$options" option 12 | do 13 | case "$option" in 14 | p ) pflag=true; PROFILE=$OPTARG;; 15 | h ) usage; exit;; 16 | \? ) echo "Unknown option: -$OPTARG" >&2; exit 1;; 17 | : ) echo "Missing option argument for -$OPTARG" >&2; exit 1;; 18 | * ) echo "Unimplemented option: -$OPTARG" >&2; exit 1;; 19 | esac 20 | done 21 | 22 | if "$pflag" 23 | then 24 | echo "using AWS profile $PROFILE..." >&2 25 | fi 26 | REGION=$(aws configure get region ${PROFILE:+--profile "$PROFILE"}) 27 | 28 | ARTIFACTS_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/storage/rArtifactsBucket/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 29 | aws s3 cp "$DIRNAME/scripts/legislators-glue-job.py" "s3://$ARTIFACTS_BUCKET/artifacts/" ${PROFILE:+--profile "$PROFILE"} 30 | 31 | mkdir "$DIRNAME"/output 32 | 33 | function send_legislators() 34 | { 35 | ORIGIN="$DIRNAME/data/" 36 | 37 | RAW_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/storage/rRawBucket/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 38 | KMS_KEY=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/dataset/rKMSDataKey/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 39 | 40 | S3_DESTINATION=s3://$RAW_BUCKET/ 41 | COUNT=0 42 | for FILE in "$ORIGIN"/*.json; 43 | do 44 | (( COUNT++ )) || true 45 | aws s3 cp "$FILE" "${S3_DESTINATION}legislators/" --sse aws:kms --sse-kms-key-id "$KMS_KEY" ${PROFILE:+--profile "$PROFILE"} 46 | echo "transferred $COUNT files" 47 | done 48 | } 49 | 50 | VPC_SUPPORT=$(aws --region "$REGION" ssm get-parameter --name "/SDLF/VPC/Enabled" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"} 2>/dev/null) 51 | if [ -z "$VPC_SUPPORT" ] 52 | then 53 | aws --region "$REGION" ssm put-parameter --name "/SDLF/VPC/Enabled" --value "false" --type String ${PROFILE:+--profile "$PROFILE"} 54 | fi 55 | 56 | aws cloudformation package --template-file "$DIRNAME"/scripts/legislators-glue-job.yaml \ 57 | --s3-bucket "$ARTIFACTS_BUCKET" \ 58 | ${PROFILE:+--profile "$PROFILE"} \ 59 | --output-template-file "$DIRNAME"/output/packaged-template.yaml 60 | 61 | STACK_NAME="sdlf-legislators-glue-job" 62 | aws cloudformation deploy \ 63 | --s3-bucket "$ARTIFACTS_BUCKET" --s3-prefix sdlf-utils \ 64 | --stack-name 
"$STACK_NAME" \ 65 | --template-file "$DIRNAME"/output/packaged-template.yaml \ 66 | --tags Framework=sdlf \ 67 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" \ 68 | --region "$REGION" \ 69 | ${PROFILE:+--profile "$PROFILE"} || exit 1 70 | 71 | send_legislators 72 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/scripts/legislators-glue-job.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from awsglue.context import GlueContext 4 | from awsglue.job import Job 5 | from awsglue.transforms import Join 6 | from awsglue.utils import getResolvedOptions 7 | from pyspark.context import SparkContext 8 | 9 | args = getResolvedOptions(sys.argv, ["JOB_NAME", "SOURCE_LOCATION", "OUTPUT_LOCATION"]) 10 | source = args["SOURCE_LOCATION"] 11 | destination = args["OUTPUT_LOCATION"] 12 | 13 | glueContext = GlueContext(SparkContext.getOrCreate()) 14 | job = Job(glueContext) 15 | job.init(args["JOB_NAME"], args) 16 | 17 | persons = glueContext.create_dynamic_frame.from_options( 18 | connection_type="s3", 19 | format="json", 20 | connection_options={"paths": ["{}/{}".format(source, "persons_parsed.json")]}, 21 | format_options={"withHeader": False}, 22 | transformation_ctx="path={}".format("persons_df"), 23 | ) 24 | 25 | memberships = glueContext.create_dynamic_frame.from_options( 26 | connection_type="s3", 27 | format="json", 28 | connection_options={"paths": ["{}/{}".format(source, "memberships_parsed.json")]}, 29 | format_options={"withHeader": False}, 30 | transformation_ctx="path={}".format("memberships_df"), 31 | ) 32 | 33 | organizations = ( 34 | glueContext.create_dynamic_frame.from_options( 35 | connection_type="s3", 36 | format="json", 37 | connection_options={"paths": ["{}/{}".format(source, "organizations_parsed.json")]}, 38 | format_options={"withHeader": False}, 39 | transformation_ctx="path={}".format("organizations_df"), 40 | ) 41 | .rename_field("id", "org_id") 42 | .rename_field("name", "org_name") 43 | ) 44 | 45 | history = Join.apply( 46 | organizations, Join.apply(persons, memberships, "id", "person_id"), "org_id", "organization_id" 47 | ).drop_fields(["person_id", "org_id"]) 48 | 49 | persons.toDF().write.mode("overwrite").parquet("{}/persons/".format(destination)) 50 | organizations.toDF().write.mode("overwrite").parquet("{}/organizations/".format(destination)) 51 | memberships.toDF().write.mode("overwrite").parquet("{}//memberships/".format(destination)) 52 | history.toDF().write.mode("overwrite").parquet("{}/history/".format(destination), partitionBy=["org_name"]) 53 | 54 | job.commit() 55 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/scripts/legislators-glue-job.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Glue Job Sample 3 | 4 | Parameters: 5 | pPipelineDeploymentInstance: 6 | Type: String 7 | Description: specific pipeline stage deployment instance this job is for 8 | Default: mainB 9 | pArtifactsBucket: 10 | Description: S3 bucket used to store artifacts (from CICD or generated by data pipelines) 11 | Type: AWS::SSM::Parameter::Value 12 | Default: /sdlf/storage/rArtifactsBucket/dev 13 | pEnableVpc: 14 | Description: Deploy SDLF resources in a VPC 15 | Type: AWS::SSM::Parameter::Value 16 | Default: /SDLF/VPC/Enabled 17 | 18 | Conditions: 19 | RunInVpc: 
!Equals [!Ref pEnableVpc, true] 20 | 21 | Resources: 22 | rGlueRole: 23 | Type: AWS::IAM::Role 24 | Properties: 25 | Path: /service-role/ 26 | AssumeRolePolicyDocument: 27 | Version: 2012-10-17 28 | Statement: 29 | - Effect: Allow 30 | Principal: 31 | Service: 32 | - glue.amazonaws.com 33 | Action: 34 | - sts:AssumeRole 35 | ManagedPolicyArns: 36 | - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole 37 | - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess 38 | - !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchLogsFullAccess 39 | Policies: 40 | - PolicyName: !Sub sdlf-${pPipelineDeploymentInstance}-glue-job 41 | PolicyDocument: 42 | Version: 2012-10-17 43 | Statement: 44 | - Effect: Allow 45 | Action: 46 | - kms:CreateGrant 47 | - kms:Decrypt 48 | - kms:DescribeKey 49 | - kms:Encrypt 50 | - kms:GenerateDataKey* 51 | - kms:ReEncrypt* 52 | Resource: 53 | - "{{resolve:ssm:/sdlf/dataset/rKMSInfraKey/dev:1}}" 54 | - "{{resolve:ssm:/sdlf/dataset/rKMSDataKey/dev:1}}" 55 | - "{{resolve:ssm:/sdlf/storage/rKMSKey/dev:1}}" 56 | 57 | rGlueJob: 58 | Type: AWS::Glue::Job 59 | Properties: 60 | Command: 61 | Name: glueetl 62 | PythonVersion: "3" 63 | ScriptLocation: !Sub s3://${pArtifactsBucket}/artifacts/${pPipelineDeploymentInstance}-glue-job.py 64 | DefaultArguments: !If 65 | - RunInVpc 66 | - 67 | "--job-bookmark-option": job-bookmark-enable 68 | "--enable-metrics": "" 69 | "--disable-proxy-v2": "true" 70 | - 71 | "--job-bookmark-option": job-bookmark-enable 72 | "--enable-metrics": "" 73 | ExecutionProperty: 74 | MaxConcurrentRuns: 3 75 | MaxRetries: 0 76 | MaxCapacity: 2.0 77 | GlueVersion: "4.0" 78 | Name: !Sub sdlf-${pPipelineDeploymentInstance}-glue-job 79 | SecurityConfiguration: "{{resolve:ssm:/sdlf/dataset/rGlueSecurityConfiguration/dev:1}}" 80 | Role: !Ref rGlueRole 81 | -------------------------------------------------------------------------------- /validate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | shopt -s globstar 4 | 5 | # python 6 | ruff format --check . 7 | ruff check . 8 | # pylint $(git ls-files --exclude-standard '*.py') # pylint is disabled for now 9 | trivy fs --scanners vuln . 10 | 11 | # shell 12 | find . -type f \( -name '*.sh' -o -name '*.bash' -o -name '*.ksh' \) -print0 \ 13 | | xargs -0 shellcheck -x --format gcc 14 | 15 | # cloudformation 16 | cfn-lint ./**/*.yaml 17 | 18 | ## unfortunately cfn_nag doesn't support fn::foreach so we exclude files using it: https://github.com/stelligent/cfn_nag/issues/621 19 | find . -not \( -type f -name 'template-glue-job.yaml' -o -type f -name 'template-lambda-layer.yaml' \) -type f -name '*.yaml' -print0 \ 20 | | xargs -0 -L 1 cfn_nag_scan --fail-on-warnings --ignore-fatal --deny-list-path .cfn-nag-deny-list.yml --input-path 21 | --------------------------------------------------------------------------------