├── .cfn-nag-deny-list.yml ├── .cfnlintrc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ ├── question.md │ └── support-the-sdlf.md └── workflows │ ├── static-checking.yml │ └── workshop-deployment.yml ├── .gitignore ├── .mkdocs.yml ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── deploy.sh ├── docs ├── _static │ ├── drawio │ │ ├── code-repositories-structure.drawio │ │ ├── datalake-architecture.drawio │ │ ├── sdlf-architecture-datalake.drawio │ │ ├── sdlf-architecture-datamesh.drawio │ │ ├── sdlf-dataset.drawio │ │ ├── sdlf-foundations.drawio │ │ ├── sdlf-in-a-nutshell.drawio │ │ ├── sdlf-monitoring.drawio │ │ ├── sdlf-pipeline.drawio │ │ ├── sdlf-stage-dataquality.drawio │ │ ├── sdlf-stage-glue.drawio │ │ ├── sdlf-stage-lambda.drawio │ │ └── sdlf-team.drawio │ ├── public-references.png │ ├── sail-icon.ico │ ├── sail-icon.png │ ├── sdlf-architecture-datalake.png │ ├── sdlf-architecture-datamesh.png │ ├── sdlf-cicd-gluejobsdeployer.png │ ├── sdlf-cicd.png │ ├── sdlf-dataset.png │ ├── sdlf-foundations.png │ ├── sdlf-in-a-nutshell.png │ ├── sdlf-layers-architecture.png │ ├── sdlf-logo.svg │ ├── sdlf-monitoring.png │ ├── sdlf-pipeline-full.png │ ├── sdlf-pipeline.png │ ├── sdlf-stage-dataquality.png │ ├── sdlf-stage-glue.png │ ├── sdlf-stage-lambda.png │ └── sdlf-team.png ├── architecture.md ├── constructs │ ├── cicd.md │ ├── dataset.md │ ├── foundations.md │ ├── index.md │ ├── monitoring.md │ ├── pipeline.md │ ├── stage-dataquality.md │ ├── stage-glue.md │ ├── stage-lambda.md │ └── team.md ├── index.md └── requirements.txt ├── pyproject.toml ├── sdlf-cicd ├── .gitignore ├── README.md ├── deploy-cicd.sh ├── deploy-role.sh ├── lambda │ ├── crossaccountteam-cicd │ │ └── src │ │ │ └── lambda_function.py │ ├── domain-cicd │ │ └── src │ │ │ └── lambda_function.py │ ├── parser-cicd │ │ └── src │ │ │ └── lambda_function.py │ └── stagesrepositories-cicd │ │ └── src │ │ └── lambda_function.py ├── nested-stacks │ ├── template-cicd-cfn-module.yaml │ ├── template-cicd-glue-job.yaml │ ├── template-cicd-lambda-layer.yaml │ └── template-cicd-modules-pipelines.yaml ├── sam-translate.py ├── tags.json ├── template-cfn-module.yaml ├── template-cicd-domain-roles.yaml ├── template-cicd-domain-team-role.yaml ├── template-cicd-domain.yaml ├── template-cicd-generic-git.yaml ├── template-cicd-generic-role.yaml ├── template-cicd-prerequisites.yaml ├── template-cicd-sdlf-pipelines.yaml ├── template-cicd-sdlf-repositories.codecommit.yaml ├── template-cicd-sdlf-repositories.github.yaml ├── template-cicd-sdlf-repositories.gitlab.yaml ├── template-cicd-team-pipeline.yaml ├── template-cicd-team-repository.yaml ├── template-codecommit-pr-check.yaml ├── template-generic-cfn-module.yaml ├── template-generic-cfn-template.yaml ├── template-glue-job.part ├── template-glue-job.yaml └── template-lambda-layer.yaml ├── sdlf-datalakeLibrary ├── .gitignore ├── README.md ├── buildspec.sh ├── python │ ├── __init__.py │ └── datalake_library │ │ ├── commons.py │ │ ├── data_quality │ │ └── schema_validator.py │ │ ├── datalake_exceptions.py │ │ ├── interfaces │ │ ├── __init__.py │ │ ├── dynamo_interface.py │ │ ├── s3_interface.py │ │ ├── sqs_interface.py │ │ └── states_interface.py │ │ ├── requirements.txt │ │ └── sdlf │ │ ├── __init__.py │ │ ├── __version__.py │ │ ├── config.py │ │ ├── peh.py │ │ └── utils.py └── template-lambda-layer.yaml ├── sdlf-dataset ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── dataset.py │ 
├── dataset.yaml │ └── template.yaml ├── sdlf-foundations ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── foundations.py │ ├── foundations.yaml │ ├── lambda │ ├── catalog-redrive │ │ └── src │ │ │ └── lambda_function.py │ ├── catalog │ │ └── src │ │ │ └── lambda_function.py │ └── replicate │ │ └── src │ │ ├── event-create-delete-table.json │ │ ├── event-update-table.json │ │ └── lambda_function.py │ └── template.yaml ├── sdlf-monitoring ├── .gitignore ├── kibana │ ├── generic_dashboard.json │ └── generic_visualizations.json ├── lambda │ ├── cloudwatchlogs-transformer │ │ └── src │ │ │ └── lambda_function.py │ └── topic │ │ └── src │ │ └── lambda_function.py └── template.yaml ├── sdlf-pipeline ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── pipeline.py │ ├── pipeline.yaml │ └── template.yaml ├── sdlf-stage-dataquality ├── .gitignore ├── lambda │ ├── initial-check │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-redrive │ │ └── src │ │ │ └── lambda_function.py │ └── stage-routing │ │ └── src │ │ └── lambda_function.py ├── state-machine │ └── data-quality.asl.json └── template.yaml ├── sdlf-stage-ecsfargate ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── ecsfargate.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── state-machine │ └── stage-ecsfargate.asl.json │ └── template.yaml ├── sdlf-stage-emrserverless ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── emrserverless.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── stageemrserverless.yaml │ └── state-machine │ └── stage-emrserverless.asl.json ├── sdlf-stage-glue ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── glue.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── stageglue.yaml │ ├── state-machine │ └── stage-glue.asl.json │ └── template.yaml ├── sdlf-stage-lambda ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── awslambda.py │ ├── lambda │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── process-object │ │ └── src │ │ │ └── lambda_function.py │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ └── routing │ │ └── src │ │ └── lambda_function.py │ ├── stagelambda.yaml │ ├── state-machine │ └── stage-lambda.asl.json │ └── template.yaml ├── sdlf-stageA ├── .gitignore ├── lambda │ ├── stage-a-error │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-preupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-process-object │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-redrive │ │ └── src │ │ │ └── lambda_function.py │ └── stage-a-routing │ │ └── src │ │ └── lambda_function.py ├── 
state-machine │ └── stage-a.asl.json └── template.yaml ├── sdlf-stageB ├── .gitignore ├── lambda │ ├── stage-b-error │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-fetch-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-redrive │ │ └── src │ │ │ └── lambda_function.py │ └── stage-b-routing │ │ └── src │ │ └── lambda_function.py ├── state-machine │ └── stage-b.asl.json └── template.yaml ├── sdlf-team ├── .gitignore ├── README.md ├── buildspec.sh ├── pyproject.toml └── src │ ├── __init__.py │ ├── lambda │ ├── datasets-dynamodb │ │ └── src │ │ │ └── lambda_function.py │ └── pipelines-dynamodb │ │ └── src │ │ └── lambda_function.py │ ├── team.py │ ├── team.yaml │ └── template.yaml ├── sdlf-utils └── workshop-examples │ ├── 10-demo │ └── sdlf-workshop │ │ ├── dataset-legislators.yaml │ │ ├── foundations-datalake-dev.yaml │ │ ├── pipeline-main.yaml │ │ ├── tags.json │ │ └── team-datalake-engineering-dev.yaml │ ├── 10-deployment │ ├── sdlf-main-datalake-engineering │ │ ├── datasets.yaml │ │ ├── pipeline-main.yaml │ │ ├── pipelines.yaml │ │ └── tags.json │ └── sdlf-main │ │ ├── datadomain-datalake-dev.yaml │ │ ├── foundations-datalake-dev.yaml │ │ ├── tags.json │ │ └── team-datalake-engineering-dev.yaml │ ├── 20-production │ ├── sdlf-main-proserve-iot │ │ ├── datasets.yaml │ │ ├── pipeline-main.yaml │ │ ├── pipeline-singlestage.yaml │ │ ├── pipelines.yaml │ │ └── tags.json │ └── sdlf-main │ │ ├── datadomain-marketing-dev.yaml │ │ ├── datadomain-proserve-dev.yaml │ │ ├── foundations-marketing-dev.yaml │ │ ├── foundations-proserve-dev.yaml │ │ ├── tags.json │ │ ├── team-marketing-industry-dev.yaml │ │ └── team-proserve-iot-dev.yaml │ ├── clean-up.sh │ └── legislators │ ├── data │ ├── memberships.json │ ├── organizations.json │ ├── persons.json │ └── regions.json │ ├── deploy.sh │ └── scripts │ ├── legislators-glue-job.py │ └── legislators-glue-job.yaml └── validate.sh /.cfn-nag-deny-list.yml: -------------------------------------------------------------------------------- 1 | RulesToSuppress: 2 | - id: W76 3 | reason: too experimental. https://stelligent.com/2020/03/27/thought-experiment-proposed-complexity-metric-for-iam-policy-documents/ 4 | - id: W89 5 | reason: SDLF does not support running in VPC by default 6 | - id: W92 7 | reason: ReservedConcurrentExecutions 8 | -------------------------------------------------------------------------------- /.cfnlintrc: -------------------------------------------------------------------------------- 1 | ignore_templates: 2 | - "sdlf-utils/workshop-examples/10-demo/sdlf-workshop/*.yaml" 3 | include_checks: 4 | - I 5 | ignore_checks: 6 | - W3002 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **SDLF release (if known):** 27 | E.g. 
1.5.2 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask us a question 4 | title: '' 5 | labels: question 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | Please be as specific as possible 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/support-the-sdlf.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Support the SDLF 3 | about: Add your organisation's name or logo to the SDLF GitHub README 4 | title: "[Support the SDLF]: " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Thank you for letting us use your organisation's name on the SDLF README page and letting other customers know that you support the project! If you would like us to also display your organisation's logo, please raise a linked pull request to provide an image file for the logo.
11 | 12 | Please add any files to *docs/_static/* 13 | 14 | Organisation Name: 15 | Your Name: 16 | Your Position: 17 | I have included a logo: y/n 18 | 19 | *By raising a Support the SDLF issue (and related pull request), you are granting AWS permission to use your company’s name (and logo) for the limited purpose described here and you are confirming that you have authority to grant such permission.* 20 | -------------------------------------------------------------------------------- /.github/workflows/static-checking.yml: -------------------------------------------------------------------------------- 1 | name: Static Checking 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | cfn: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 3.12 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: 3.12 20 | - name: Set up Ruby 3.2 21 | uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: 3.2 24 | - name: install requirements 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install "cfn-lint<1" 28 | gem install cfn-nag 29 | - name: cfn-lint 30 | run: | 31 | shopt -s globstar 32 | cfn-lint ./**/*.yaml 33 | - name: cfn-nag 34 | run: | 35 | cat <<EOT >> .cfn-nag-deny-list.yml 36 | - id: W61 37 | reason: |- 38 | Certificates are handled by customers downstream, see https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-encryption-enable.html#emr-encryption-certificates 39 | This is ignored only during CI as we want customers to be aware they need to update the security configuration should they choose to use it. 40 | EOT 41 | find . -not \( -type f -name 'template-glue-job.yaml' -o -type f -name 'template-lambda-layer.yaml' \) -type f -name '*.yaml' -print0 \ 42 | | xargs -0 -L 1 cfn_nag_scan --fail-on-warnings --ignore-fatal --deny-list-path .cfn-nag-deny-list.yml --input-path 43 | python: 44 | runs-on: ubuntu-latest 45 | steps: 46 | - uses: actions/checkout@v4 47 | - name: Set up Python 3.12 48 | uses: actions/setup-python@v5 49 | with: 50 | python-version: 3.12 51 | - name: install requirements 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install ruff 55 | - name: ruff format 56 | run: ruff format --check . 57 | - name: ruff 58 | run: ruff check --output-format github . 59 | shellcheck: 60 | runs-on: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v4 63 | - name: install requirements 64 | run: | 65 | sudo apt update 66 | sudo apt install shellcheck 67 | - name: shellcheck 68 | run: | 69 | find .
-type f \( -name '*.sh' -o -name '*.bash' -o -name '*.ksh' \) -print0 \ 70 | | xargs -0 shellcheck -x --format gcc 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Editors 2 | .vscode/ 3 | *.code-workspace 4 | .idea/ 5 | .devcontainer/ 6 | 7 | # Mac/OSX 8 | .DS_Store 9 | 10 | # Windows 11 | Thumbs.db 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Misc 22 | automated-deployment/ 23 | rpdk.log 24 | 25 | # Distribution / packaging 26 | output/ 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # cdk 158 | cdk.out 159 | cdk.context.json 160 | -------------------------------------------------------------------------------- /.mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AWS Serverless Data Lake Framework 2 | site_url: https://sdlf.readthedocs.io 3 | repo_url: https://github.com/awslabs/aws-serverless-data-lake-framework 4 | copyright: Amazon Web Services, Inc. All Rights Reserved. 5 | theme: 6 | name: material 7 | features: 8 | - navigation.tabs 9 | - navigation.tabs.sticky 10 | - toc.integrate 11 | - navigation.indexes 12 | - navigation.path 13 | logo: _static/sdlf-logo.svg 14 | favicon: _static/sail-icon.ico 15 | markdown_extensions: 16 | - admonition 17 | - attr_list 18 | - tables 19 | plugins: 20 | - search 21 | nav: 22 | - index.md 23 | - architecture.md 24 | - Constructs: 25 | - constructs/index.md 26 | - constructs/foundations.md 27 | - constructs/team.md 28 | - constructs/dataset.md 29 | - constructs/pipeline.md 30 | - constructs/stage-lambda.md 31 | - constructs/stage-glue.md 32 | - constructs/stage-dataquality.md 33 | - constructs/monitoring.md 34 | - constructs/cicd.md 35 | - 'Workshop': 'https://sdlf.workshop.aws/' 36 | - 'License': 'https://github.com/awslabs/aws-serverless-data-lake-framework/blob/main/LICENSE' 37 | - 'Contributing': 'https://github.com/awslabs/aws-serverless-data-lake-framework/blob/main/CONTRIBUTING.md' 38 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.12" 7 | 8 | mkdocs: 9 | configuration: .mkdocs.yml 10 | fail_on_warning: true 11 | 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-dataset.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-monitoring.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-pipeline.drawio: -------------------------------------------------------------------------------- 1 | 7VrbctowEP0aZtqHMMgyt0duucyQPpTOJHliNLawXWzLlUWAfn1XWAZsCZJ2uCZ9CdZKtrTnHO1q7VRwL1rccZL4j8ylYcWquYsK7lcsq2E34K80LDNDE6HM4PHAzUxbhlHwmypjTVlngUvTwkDBWCiCpGh0WBxTRxRshHM2Lw6bsLA4a0I8qhlGDgl161PgCl9ZUaO96bingeerqVtWM+uISD5YeZL6xGXzLRMeVHCPMyayq2jRo6HELsclu+92R+96YZzG4j03kF/93qBPX57Hj+i50ZxMn6f3N1b2lFcSzpTDarFimSPA2Sx2qXxIrYK7cz8QdJQQR/bOgXKw+SIKoYXgchKEYY+FjEM7ZjEM6qoZKBd0sXPpaA0ICImyiAq+hCHqhmY9u0NpyMIK0vmGEbumbP4WGXZTGYlSgbd+9AYouFBY/QVuWMNt1B/eSqeChIYBOF6GEbwXRaxSwdmUltAyAEjCwIuh6QBaFOxdiWUAEu2ojihwXTmNkZwifUcgA+lktAxcWMeiwtao6DyNwDASNIGf21nsiIDFqVwi49NJCJuwTA6bCUlabx1EJFYeJ24AoJTo2GLQhPeExUKFMWTlbTWTfCqEgUReRwtPBswqmad21QOWktWUDxDIjL1juByn4NN4kns0XvtTFlPFwnW73m3YZkWVBSRYsqWzkE6kUlPwK4i94arVxzXli2kKl6T+ISWGyhqrtzWNWaYNj9roSCpr6RteQOjvgO0LrXpV+GXc8SnwQASVYnOJIDIPsFcaSb8h+XAWwc/3ztMqi0mN/ujcDb5+3FiBaqhajt06ldiunzBe5PFKj92pZPRCyeBMELnpoXnTPhQ7Fi5wgw3bzBTK0bGosfS0qtFBXTifqSbjwmcei0k42FhLwG3GDJkMcysGf1IhlipKkxlsxQK/2Zxyov3AwrrYjDt0n9hUdhKEe1Tsc90yU8VpCLy/Fldy+D2hJ1GVPpFGwAbeXSnw6OfBdXDIhWs4gyBkOoS0j6Xc5pULNd9pbwu1cU6h5svUhbqvcLkcoRoi7GmFihq7s5/E8SKT30GYaBXPIe0zly2WgYhMyfgalGyfPeTiugbg/8LvgxV+tmGXnrjww6Z9CnVC9x8qv1W5l9d+nW+d4cuPh97oA9d/TVwq/9Zv5Aqv7tAJ4y5u7k6An7v8q78zJR6t/MP6SxaNjus6VecvO948Vds7qDrNqTpf5rWUf6hdFK6NznwWsWsaTtcl1BzBt4WKzynUfJnXUv6VhWqIsCcWqqF+/hTlH7aKTFh6yDjtVyvDF8RLLv9KSm4cMeRCc/NVfNW39a8FePAH -------------------------------------------------------------------------------- /docs/_static/public-references.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/public-references.png 
-------------------------------------------------------------------------------- /docs/_static/sail-icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sail-icon.ico -------------------------------------------------------------------------------- /docs/_static/sail-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sail-icon.png -------------------------------------------------------------------------------- /docs/_static/sdlf-architecture-datalake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-architecture-datalake.png -------------------------------------------------------------------------------- /docs/_static/sdlf-architecture-datamesh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-architecture-datamesh.png -------------------------------------------------------------------------------- /docs/_static/sdlf-cicd-gluejobsdeployer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-cicd-gluejobsdeployer.png -------------------------------------------------------------------------------- /docs/_static/sdlf-cicd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-cicd.png -------------------------------------------------------------------------------- /docs/_static/sdlf-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-dataset.png -------------------------------------------------------------------------------- /docs/_static/sdlf-foundations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-foundations.png -------------------------------------------------------------------------------- /docs/_static/sdlf-in-a-nutshell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-in-a-nutshell.png -------------------------------------------------------------------------------- /docs/_static/sdlf-layers-architecture.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-layers-architecture.png -------------------------------------------------------------------------------- /docs/_static/sdlf-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Produced by OmniGraffle 7.17.5\n2020-11-27 16:09:02 +0000 6 | 7 | Canvas 1 8 | 9 | Layer 2 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/_static/sdlf-monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-monitoring.png -------------------------------------------------------------------------------- /docs/_static/sdlf-pipeline-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-pipeline-full.png -------------------------------------------------------------------------------- /docs/_static/sdlf-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-pipeline.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-dataquality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-stage-dataquality.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-glue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-stage-glue.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-stage-lambda.png -------------------------------------------------------------------------------- /docs/_static/sdlf-team.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/docs/_static/sdlf-team.png -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | SDLF supports both a centralized datalake deployment pattern and decentralized data domains which could be used as a basis for a [data mesh](https://aws.amazon.com/what-is/data-mesh/) deployment pattern. 
4 | 5 | ## Centralized Data Lake 6 | 7 | ![Centralized Data Lake Architecture](_static/sdlf-architecture-datalake.png) 8 | 9 | !!! warning 10 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 11 | 12 | ## Data Mesh 13 | 14 | The Data Mesh pattern is fundamentally about decentralized data ownership, with data owned by specialized domain teams rather than a centralized data team. This usually means: 15 | 16 | - each data domain team has its own dedicated data infrastructure, for production and/or consumption 17 | - each data domain team is able to deploy the tools and infrastructure it needs - a self-serve data platform 18 | 19 | A governance layer is federating data assets in a business catalog to ensure compliance against policies and standards, and ease of data sharing across teams. 20 | 21 | As such, it can be seen as a collection of data domain-specific datalakes deployed with SDLF. [Amazon SageMaker Data and AI Governance](https://aws.amazon.com/sagemaker/data-ai-governance/) (built on Amazon DataZone) can be used for the governance layer. 22 | 23 | ![Data Mesh Architecture](_static/sdlf-architecture-datamesh.png) 24 | 25 | !!! warning 26 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 27 | 28 | ## Transactional Data Lake 29 | 30 | Using [Iceberg](https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/introduction.html). 31 | 32 | !!! warning 33 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 34 | -------------------------------------------------------------------------------- /docs/constructs/dataset.md: -------------------------------------------------------------------------------- 1 | # sdlf-dataset 2 | 3 | !!! note 4 | `sdlf-dataset` is defined in the [sdlf-dataset](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-dataset) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Dataset](../_static/sdlf-dataset.png) 9 | 10 | A SDLF dataset is a logical construct referring to a grouping of data. It can be anything from a single table to an entire database with multiple tables for example. However, an overall good practice is to limit the infrastructure deployed to the minimum to avoid unnecessary overhead and cost. It means that in general, the more data is grouped together the better. Abstraction at the transformation code level can then help make distinctions within a given dataset. 11 | 12 | Examples of datasets are: 13 | 14 | - A relational database with multiple tables (e.g. Sales DB with orders and customers tables) 15 | - A group of files from a data source (e.g. XML files from a Telemetry system) 16 | - A streaming data source (e.g. Kinesis data stream batching files and dumping them into S3) 17 | 18 | `sdlf-dataset` creates a Glue database, as well as a Glue crawler. 19 | 20 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 21 | 22 | ## Usage 23 | 24 | ### CloudFormation with [sdlf-cicd](cicd.md) 25 | 26 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 
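Once deployed, downstream code can look up the resources the module created through the SSM parameters listed under Interface below. A minimal boto3 sketch, assuming a team named `iot` and a dataset named `legislators` as in the module declaration that follows:

```python
import boto3

ssm = boto3.client("ssm")

# Glue crawler created by sdlf-dataset; the parameter path comes from the Interface table below
crawler_name = ssm.get_parameter(Name="/SDLF/Glue/iot/legislators/GlueCrawler")["Parameter"]["Value"]
print(crawler_name)
```

The module declaration itself looks like this: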
27 | 28 | ``` 29 | rExample: 30 | Type: awslabs::sdlf::dataset::MODULE 31 | Properties: 32 | pPipelineReference: !Ref pPipelineReference 33 | pTeamName: iot 34 | pDatasetName: legislators 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-dataset` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ----------------------------------------- | -------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Datasets/{team}/{dataset}` | Dataset-specific metadata for data pipelines | | 44 | | `/SDLF/Glue/{team}/{dataset}/GlueCrawler` | Team dataset Glue crawler | | 45 | | `/SDLF/Glue/{team}/{dataset}/DataCatalog` | Team dataset metadata catalog | | 46 | -------------------------------------------------------------------------------- /docs/constructs/monitoring.md: -------------------------------------------------------------------------------- 1 | # sdlf-monitoring 2 | 3 | !!! note 4 | `sdlf-monitoring` is defined in the [sdlf-monitoring](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-monitoring) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Monitoring](../_static/sdlf-monitoring.png) 9 | 10 | CloudTrail (Auditing) and S3 Storage Lens are resources implemented in the framework. They are deployed once only and are consumed by all systems and users across the lake. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rProserveMonitoring: 20 | Type: awslabs::sdlf::monitoring::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pCloudtrailEnabled: true 24 | ``` 25 | 26 | ## Interface 27 | 28 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-monitoring` publishes the following parameters: 29 | 30 | | SSM Parameter | Description | Comment | 31 | | ------------------------------------------ | ---------------------------------------------------------------- | -------------------------------------------- | 32 | | `/SDLF/S3/CloudTrailBucket` | Name of CloudTrail S3 bucket | | 33 | 34 | -------------------------------------------------------------------------------- /docs/constructs/stage-dataquality.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-dataquality 2 | 3 | !!! note 4 | `sdlf-stage-dataquality` is defined in the [sdlf-stage-dataquality](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stage-dataquality) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Data Quality](../_static/sdlf-stage-dataquality.png) 9 | 10 | Create a Glue Data Quality ruleset from recommendations, then apply this ruleset to a given Glue table. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example.
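The rulesets this stage creates and evaluates live in Glue Data Quality, so they can be inspected with the regular Glue API. A short boto3 sketch (purely illustrative, no SDLF-specific naming assumed):

```python
import boto3

glue = boto3.client("glue")

# List the Glue Data Quality rulesets in the account, with the table each one targets
response = glue.list_data_quality_rulesets()
for ruleset in response["Rulesets"]:
    print(ruleset["Name"], ruleset.get("TargetTable"))
```

An example module declaration, chained to the previous stage's state machine through `pEventPattern`: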
17 | 18 | ``` 19 | rMainDq: 20 | Type: proserve::iot::dataquality::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: DQ 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: !Sub >- 28 | { 29 | "source": ["aws.states"], 30 | "detail-type": ["Step Functions Execution Status Change"], 31 | "detail": { 32 | "status": ["SUCCEEDED"], 33 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-b"] 34 | } 35 | } 36 | pEnableTracing: false 37 | ``` 38 | 39 | ## Interface 40 | 41 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-dataquality` publishes the following parameters: 42 | 43 | | SSM Parameter | Description | Comment | 44 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 45 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Function | | 46 | -------------------------------------------------------------------------------- /docs/constructs/stage-glue.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-glue (sdlf-stageB) 2 | 3 | !!! note 4 | `sdlf-stage-glue` is defined in the [sdlf-stageB](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageB) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Glue](../_static/sdlf-stage-glue.png) 9 | 10 | Run a Glue job. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainB: 20 | Type: awslabs::sdlf::stageB::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 24 | pStageName: B 25 | pPipeline: main 26 | pTeamName: iot 27 | pTriggerType: schedule 28 | pEventPattern: !Sub >- 29 | { 30 | "source": ["aws.states"], 31 | "detail-type": ["Step Functions Execution Status Change"], 32 | "detail": { 33 | "status": ["SUCCEEDED"], 34 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 35 | } 36 | } 37 | pSchedule: "cron(*/5 * * * ? *)" 38 | pEnableTracing: false 39 | ``` 40 | 41 | ## Interface 42 | 43 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-glue` publishes the following parameters: 44 | 45 | | SSM Parameter | Description | Comment | 46 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 47 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions state machine | | 48 | -------------------------------------------------------------------------------- /docs/constructs/stage-lambda.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-lambda (sdlf-stageA) 2 | 3 | !!!
note 4 | `sdlf-stage-lambda` is defined in the [sdlf-stageA](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageA) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Lambda](../_static/sdlf-stage-lambda.png) 9 | 10 | Run a Lambda function. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainA: 20 | Type: awslabs::sdlf::stageA::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: A 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: >- 28 | { 29 | "source": ["aws.s3"], 30 | "detail-type": ["Object Created"], 31 | "detail": { 32 | "bucket": { 33 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 34 | }, 35 | "object": { 36 | "key": [{ "prefix": "iot/legislators/" }] 37 | } 38 | } 39 | } 40 | pEnableTracing: false 41 | ``` 42 | 43 | ## Interface 44 | 45 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-lambda` publishes the following parameters: 46 | 47 | | SSM Parameter | Description | Comment | 48 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 49 | | `/SDLF/Lambda/{team}/{pipeline}{stage}RoutingLambda` | Routing Lambda | | 50 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions | | 51 | -------------------------------------------------------------------------------- /docs/constructs/team.md: -------------------------------------------------------------------------------- 1 | # sdlf-team 2 | 3 | !!! note 4 | `sdlf-team` is defined in the [sdlf-team](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-team) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Team](../_static/sdlf-team.png){: style="width:80%"} 9 | 10 | A team is a group of individuals that wish to onboard into the data lake. It can be a pizza team of developers or an entire Business Unit such as the marketing or finance department. A team is responsible for their data pipelines, datasets and repositories which are unique to the team and completely segregated from others. Teams are also isolated from both an operational and security standpoint through least-privilege IAM policies. 11 | 12 | As such `sdlf-team` is mostly about permissions. 13 | 14 | The two `Pipelines` and `Datasets` Lambda functions (and related resources) are used to populate the DynamoDB tables `octagon-Pipelines-{environment}` and `octagon-Datasets-{environment}` from `sdlf-foundations`. 15 | 16 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 17 | 18 | !!! warning 19 | The data lake admin team should be the only one with write access to the `sdlf-team` code base, as it is used to restrict permissions given to team members. 20 | 21 | ## Usage 22 | 23 | ### CloudFormation with [sdlf-cicd](cicd.md) 24 | 25 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 
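Since the `Datasets` and `Pipelines` functions register entries in the `octagon-Datasets-{environment}` and `octagon-Pipelines-{environment}` DynamoDB tables, a quick way to check what a team has onboarded is to read those tables directly. A boto3 sketch, assuming the `dev` environment used in the example below:

```python
import boto3

dynamodb = boto3.resource("dynamodb")

# Table populated by the sdlf-team Datasets function (table itself is created by sdlf-foundations)
table = dynamodb.Table("octagon-Datasets-dev")
for item in table.scan()["Items"]:
    print(item)
```

The team module itself is declared as follows: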
26 | 27 | ``` 28 | rExample: 29 | Type: awslabs::sdlf::team::MODULE 30 | Properties: 31 | pPipelineReference: !Ref pPipelineReference 32 | pTeamName: industry 33 | pEnvironment: dev 34 | pSNSNotificationsEmail: nobody@amazon.com 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-team` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Athena/{team}/WorkgroupName` | Team Athena workgroup name | | 44 | | `/SDLF/EventBridge/{team}/EventBusName` | Name of the team dedicated event bus | | 45 | | `/SDLF/EventBridge/{team}/ScheduleGroupName` | Name of the team dedicated schedule group | | 46 | | `/SDLF/Glue/${pTeamName}/SecurityConfigurationId` | Glue security configuration name | | 47 | | `/SDLF/IAM/${pTeamName}/CrawlerRoleArn` | IAM Role ARN for Glue crawlers | | 48 | | `/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary` | ARN of the permissions boundary IAM Managed policy for the team | | 49 | | `/SDLF/KMS/${pTeamName}/DataKeyId` | ARN of the team KMS data key | | 50 | | `/SDLF/KMS/${pTeamName}/InfraKeyId` | ARN of the team KMS infrastructure key | | 51 | | `/SDLF/SNS/${pTeamName}/Notifications` | ARN of the team-specific SNS Topic | | 52 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # AWS Serverless Data Lake Framework 2 | 3 | Serverless Data Lake Framework (SDLF) is a collection of reusable artifacts aimed at accelerating the delivery of enterprise data lakes on AWS, shortening the deployment time to production from several months to a few weeks. It can be used by AWS teams, partners and customers to implement the foundational structure of a data lake following best practices. 4 | 5 | ## Motivation 6 | 7 | A data lake gives your organization agility. It provides a repository where consumers can quickly find the data they need and use it in their business projects. However, building a data lake can be complex; there's a lot to think about beyond the storage of files. For example, how do you catalog the data so you know what you've stored? What ingestion pipelines do you need? How do you manage data quality? How do you keep the code for your transformations under source control? How do you manage development, test and production environments? Building a solution that addresses these use cases can take many weeks and this time can be better spent innovating with data and achieving business goals. 8 | 9 | SDLF is a collection of production-hardened, best-practices templates which accelerate your data lake implementation journey on AWS, so that you can focus on use cases that generate value for business. 10 | 11 | ## Major Features 12 | 13 | At a high level, SDLF is an infrastructure-as-code framework that enables customers to create: 14 | 15 | - End-to-end data architectures such as a centralized (transactional) data lake or a data mesh 16 | - Foundational data lake assets (e.g. 
Amazon S3 buckets for data storage) 17 | - Event-driven jobs that orchestrate the transformation of data, storing the output in a new location on S3 18 | - Data processing stages using AWS serverless services such as Lambda or Glue 19 | - Git-driven deployment pipelines (CICD) for the entire data infrastructure 20 | 21 | Using all SDLF features as illustrated in the [official workshop](https://sdlf.workshop.aws/) gives you: 22 | 23 | 1. **Traceability and version control**: 24 | - SDLF is entirely managed through CICD pipelines. At no point is interaction with the AWS console necessary (in fact it's discouraged). 25 | - Using version control ensures that any change to the data lake is scrutinized before it enters production. 26 | 27 | 2. **Scalability and reproducibility**: 28 | - Deploying and tearing down a customized, production-grade data lake can be done in minutes and across multiple accounts and environments. 29 | - This is in comparison to a manual approach which would be tedious, slow, prone to errors and unable to scale. 30 | 31 | 3. **Best practices**: 32 | - Best practices acquired through dozens of implementations in production are enforced in the framework. 33 | - Features such as monitoring (S3 Storage Lens, Cloudtrail), encryption (KMS), alerting (Cloudwatch alarms), data permissions (Lake Formation) and many more are baked in SDLF so you don't have to reinvent the wheel. 34 | 35 | ## Public References 36 | 37 | ![SDLF Public References](_static/public-references.png) -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs==1.6.1 2 | mkdocs-material==9.5.36 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | extend-exclude = ["sdlf-cicd/sam-translate.py"] 3 | line-length = 120 4 | target-version = "py312" 5 | 6 | [tool.ruff.lint] 7 | extend-select = ["I", "PL", "W"] 8 | ignore = ["PLR0912", "PLR0913", "PLR0915"] 9 | fixable = ["I001", "W291"] 10 | 11 | [tool.pylint.main] 12 | py-version = "3.12" 13 | ignore-paths = ["^/sdlf-cicd/sam-translate.py"] 14 | jobs = 0 15 | 16 | [tool.pylint.format] 17 | max-line-length = 120 18 | max-module-lines = 1500 19 | 20 | [tool.pylint.logging] 21 | # The type of string formatting that logging methods do. `old` means using % 22 | # formatting, `new` is for `{}` formatting. 23 | logging-format-style = "new" 24 | 25 | # Logging modules to check that the string format arguments are in logging 26 | # function parameter format. 
27 | logging-modules = ["logging", "datalake_library.commons"] 28 | 29 | [tool.pylint.similarities] 30 | min-similarity-lines = 10 31 | -------------------------------------------------------------------------------- /sdlf-cicd/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv 22 | -------------------------------------------------------------------------------- /sdlf-cicd/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-cicd/README.md -------------------------------------------------------------------------------- /sdlf-cicd/lambda/stagesrepositories-cicd/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | 9 | codecommit_endpoint_url = "https://codecommit." + os.getenv("AWS_REGION") + ".amazonaws.com" 10 | codecommit = boto3.client("codecommit", endpoint_url=codecommit_endpoint_url) 11 | codepipeline_endpoint_url = "https://codepipeline." + os.getenv("AWS_REGION") + ".amazonaws.com" 12 | codepipeline = boto3.client("codepipeline", endpoint_url=codepipeline_endpoint_url) 13 | 14 | 15 | def lambda_handler(event, context): 16 | try: 17 | sdlf_stage_repositories = [] 18 | next_token = None 19 | while True: 20 | if next_token: 21 | response = codecommit.list_repositories(nextToken=next_token) 22 | else: 23 | response = codecommit.list_repositories() 24 | repos = response["repositories"] 25 | sdlf_stage_repositories.extend( 26 | [ 27 | repo["repositoryName"] 28 | for repo in repos 29 | if repo["repositoryName"].startswith(os.getenv("STAGES_REPOSITORIES_PREFIX")) 30 | ] 31 | ) 32 | next_token = response.get("nextToken") 33 | if not next_token: 34 | break 35 | 36 | logger.info("sdlf_stage_repositories: %s", sdlf_stage_repositories) 37 | 38 | except Exception as e: 39 | message = "Function exception: " + str(e) 40 | codepipeline.put_job_failure_result( 41 | jobId=event["CodePipeline.job"]["id"], 42 | failureDetails={"message": message, "type": "JobFailed"}, 43 | ) 44 | raise 45 | 46 | codepipeline.put_job_success_result( 47 | jobId=event["CodePipeline.job"]["id"], 48 | outputVariables={ 49 | "StagesRepositories": ",".join(sdlf_stage_repositories), 50 | "StagesRepositoriesCount": ",".join(list(map(str, range(0, len(sdlf_stage_repositories))))), 51 | }, 52 | ) 53 | return "Success" 54 | -------------------------------------------------------------------------------- /sdlf-cicd/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-cicd/template-cfn-module.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | 
Type: String 8 | pEnvironment: 9 | Description: Environment name 10 | Type: String 11 | AllowedValues: [dev, test, prod] 12 | pDomain: 13 | Description: Name of the data domain (all lowercase, no symbols or spaces) 14 | Type: String 15 | pTeamName: 16 | Description: Name of the team (all lowercase, no symbols or spaces) 17 | Type: String 18 | pModuleName: 19 | Description: Name of the module 20 | Type: String 21 | pModuleGitRef: 22 | Description: Git reference (commit id) with the sources of this module version 23 | Type: String 24 | 25 | Resources: 26 | rCloudFormationModule: 27 | Type: AWS::CloudFormation::ModuleVersion 28 | Properties: 29 | ModuleName: !Sub "${pDomain}::${pTeamName}::${pModuleName}::MODULE" 30 | ModulePackage: !Sub "s3://${pArtifactsBucket}/modules/${pDomain}/${pEnvironment}/${pTeamName}/${pModuleName}-${pModuleGitRef}.zip" 31 | 32 | rCloudFormationModuleDefaultVersion: 33 | Type: AWS::CloudFormation::ModuleDefaultVersion 34 | Properties: 35 | Arn: !Ref rCloudFormationModule 36 | 37 | rCloudFormationModuleSsm: 38 | Type: AWS::SSM::Parameter 39 | DependsOn: rCloudFormationModuleDefaultVersion 40 | Properties: 41 | Name: !Sub /SDLF/CFN/${pDomain}-${pTeamName}-${pModuleName}-MODULE 42 | Type: String 43 | Value: !Ref pModuleGitRef 44 | Description: Git reference (commit id) with the sources of this module version 45 | -------------------------------------------------------------------------------- /sdlf-cicd/template-cicd-team-repository.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: CICD resources to handle deployment of a new team (repository) 3 | 4 | Parameters: 5 | pKMSKey: 6 | Description: The KMS key used by CodeBuild and CodePipeline 7 | Type: AWS::SSM::Parameter::Value<String> 8 | Default: /SDLF/KMS/CICDKeyId 9 | pDomain: 10 | Description: Name of the data domain (all lowercase, no symbols or spaces) 11 | Type: String 12 | AllowedPattern: "[a-z0-9]{2,9}" 13 | pTeamName: 14 | Description: Team name 15 | Type: String 16 | pGitPlatform: 17 | Description: Platform used to host git repositories 18 | Type: AWS::SSM::Parameter::Value<String> 19 | Default: /SDLF/Misc/GitPlatform 20 | pMainRepositoriesPrefix: 21 | Type: String 22 | Default: sdlf-main- 23 | 24 | Conditions: 25 | GitPlatformCodeCommit: !Equals [!Ref pGitPlatform, "CodeCommit"] 26 | GitPlatformGitLab: !Equals [!Ref pGitPlatform, "GitLab"] 27 | GitPlatformGitHub: !Equals [!Ref pGitPlatform, "GitHub"] 28 | 29 | Resources: 30 | rTeamMainCodeCommit: 31 | Type: AWS::CodeCommit::Repository 32 | Condition: GitPlatformCodeCommit 33 | Metadata: 34 | cfn-lint: 35 | config: 36 | ignore_checks: 37 | - E3002 38 | Properties: 39 | Code: 40 | BranchName: main 41 | S3: ./README.md 42 | RepositoryDescription: !Sub ${pDomain} ${pTeamName} main repository 43 | RepositoryName: !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 44 | KmsKeyId: !Ref pKMSKey 45 | 46 | rTeamMainGitLab: 47 | Type: GitLab::Projects::Project 48 | Metadata: 49 | cfn-lint: 50 | config: 51 | ignore_checks: 52 | - E3001 53 | Condition: GitPlatformGitLab 54 | Properties: 55 | Name: !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 56 | # Path: "{{resolve:ssm:/SDLF/${pGitPlatform}/Group}}" 57 | 58 | rTeamMainGitHub: 59 | Type: GitHub::Repositories::Repository 60 | Metadata: 61 | cfn-lint: 62 | config: 63 | ignore_checks: 64 | - E3001 65 | Condition: GitPlatformGitHub 66 | Properties: 67 | Org: !Sub "{{resolve:ssm:/SDLF/${pGitPlatform}/Group}}" 68 | Name: !Sub
${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 69 | Private: true 70 | Visibility: private 71 | Archived: false 72 | 73 | rTeamMainCodeCommitSsm: 74 | Type: AWS::SSM::Parameter 75 | Properties: 76 | Name: !Sub /SDLF/${pGitPlatform}/${pTeamName}/Main${pGitPlatform} 77 | Type: String 78 | Value: !If 79 | - GitPlatformCodeCommit 80 | - !GetAtt rTeamMainCodeCommit.Name 81 | - !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} # !GetAtt rTeamMainGitLab.Name 82 | Description: !Sub Name of the ${pDomain} ${pTeamName} main repository 83 | -------------------------------------------------------------------------------- /sdlf-cicd/template-generic-cfn-module.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | Type: String 8 | pLibraryOrg: 9 | Description: Name of the org (all lowercase, no symbols or spaces) 10 | Type: String 11 | pLibraryFramework: 12 | Description: Name of the framework (all lowercase, no symbols or spaces) 13 | Type: String 14 | pLibraryModule: 15 | Description: Name of the module 16 | Type: String 17 | pModuleGitRef: 18 | Description: Git reference (commit id) with the sources of this module version 19 | Type: String 20 | 21 | Resources: 22 | rCloudFormationModule: 23 | Type: AWS::CloudFormation::ModuleVersion 24 | Properties: 25 | ModuleName: !Sub "${pLibraryOrg}::${pLibraryFramework}::${pLibraryModule}::MODULE" 26 | ModulePackage: !Sub "s3://${pArtifactsBucket}/modules/${pLibraryOrg}/${pLibraryFramework}/${pLibraryModule}-${pModuleGitRef}.zip" 27 | 28 | rCloudFormationModuleDefaultVersion: 29 | Type: AWS::CloudFormation::ModuleDefaultVersion 30 | Properties: 31 | Arn: !Ref rCloudFormationModule 32 | 33 | rCloudFormationModuleSsm: 34 | Type: AWS::SSM::Parameter 35 | DependsOn: rCloudFormationModuleDefaultVersion 36 | Properties: 37 | Name: !Sub /SDLF/CFN/${pLibraryOrg}-${pLibraryFramework}-${pLibraryModule}-MODULE 38 | Type: String 39 | Value: !Ref pModuleGitRef 40 | Description: Git reference (commit id) with the sources of this module version -------------------------------------------------------------------------------- /sdlf-cicd/template-generic-cfn-template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pModuleName: 6 | Description: Name of the module 7 | Type: String 8 | pModuleGitRef: 9 | Description: Git reference (commit id) with the sources of this module version 10 | Type: String 11 | pModuleS3Url: 12 | Description: S3 URL (https) to the module template 13 | Type: String 14 | 15 | Resources: 16 | rCloudFormationModuleSsm: 17 | Type: AWS::SSM::Parameter 18 | Properties: 19 | Name: !Sub /sdlf/${pModuleName}/${pModuleGitRef} 20 | Type: String 21 | Value: !Ref pModuleS3Url 22 | Description: S3 URL (https) to the module template of this module version 23 | -------------------------------------------------------------------------------- /sdlf-cicd/template-glue-job.part: -------------------------------------------------------------------------------- 1 | 2 | r%{BUILDSTEPVARIABLE_NOHYPHEN_AZ}GlueConnection: 3 | Type: AWS::Glue::Connection 4 | Condition: RunInVpc 5 | Metadata: 6 | cfn-lint: 7 | config: 8 | ignore_checks: 9 | - W3010 10 | Properties: 11 | CatalogId: !Ref 
AWS::AccountId 12 | ConnectionInput: 13 | ConnectionProperties: {} 14 | ConnectionType: NETWORK 15 | Description: "Network connected to the VPC data source" 16 | Name: !Sub sdlf-${pTeamName}-glue-conn-%{BUILDSTEPVARIABLE_NOHYPHEN_AZ} 17 | PhysicalConnectionRequirements: 18 | AvailabilityZone: %{BUILDSTEPVARIABLE_AZ} 19 | SecurityGroupIdList: !Split [",", !ImportValue sdlf-cicd-domain-roles-vpc-security-groups] 20 | SubnetId: %{BUILDSTEPVARIABLE_SUBNET} 21 | -------------------------------------------------------------------------------- /sdlf-cicd/template-glue-job.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Transform: AWS::LanguageExtensions 3 | Description: Deploy Glue jobs 4 | 5 | Parameters: 6 | pArtifactsBucket: 7 | Description: The artifacts bucket used by CodeBuild and CodePipeline 8 | Type: AWS::SSM::Parameter::Value 9 | Default: /SDLF/S3/ArtifactsBucket 10 | pTeamName: 11 | Description: Name of the team (all lowercase, no symbols or spaces) 12 | Type: String 13 | pGlueJobs: 14 | Description: List of glue job names 15 | Type: CommaDelimitedList 16 | AllowedPattern: "^[a-zA-Z0-9\\-]*$" 17 | pEnableVpc: 18 | Description: Deploy SDLF resources in a VPC 19 | Type: AWS::SSM::Parameter::Value 20 | Default: /SDLF/VPC/Enabled 21 | 22 | Conditions: 23 | GlueJobsNotEmpty: !Not 24 | - !Equals 25 | - !Join ["", !Ref pGlueJobs] 26 | - "" 27 | RunInVpc: !Equals [!Ref pEnableVpc, true] 28 | 29 | Resources: 30 | rGlueRole: 31 | Type: AWS::IAM::Role 32 | Properties: 33 | Path: /service-role/ 34 | PermissionsBoundary: !Sub "{{resolve:ssm:/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary}}" 35 | AssumeRolePolicyDocument: 36 | Version: 2012-10-17 37 | Statement: 38 | - Effect: Allow 39 | Principal: 40 | Service: 41 | - glue.amazonaws.com 42 | Action: 43 | - sts:AssumeRole 44 | ManagedPolicyArns: 45 | - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole 46 | - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess 47 | - !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchLogsFullAccess 48 | Policies: 49 | - PolicyName: !Sub sdlf-${pTeamName}-glue-job 50 | PolicyDocument: 51 | Version: 2012-10-17 52 | Statement: 53 | - Effect: Allow 54 | Action: 55 | - kms:CreateGrant 56 | - kms:Decrypt 57 | - kms:DescribeKey 58 | - kms:Encrypt 59 | - kms:GenerateDataKey* 60 | - kms:ReEncrypt* 61 | Resource: 62 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/InfraKeyId}}" 63 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/DataKeyId}}" 64 | - "{{resolve:ssm:/SDLF/KMS/KeyArn}}" 65 | 66 | "Fn::ForEach::GlueJobResources": 67 | - GlueJobName 68 | - !Ref pGlueJobs 69 | - "r&{GlueJobName}GlueJob": 70 | Type: AWS::Glue::Job 71 | Condition: GlueJobsNotEmpty 72 | Properties: 73 | Command: 74 | Name: glueetl 75 | PythonVersion: "3" 76 | ScriptLocation: !Sub s3://${pArtifactsBucket}/${pTeamName}/transforms/${GlueJobName}.py 77 | DefaultArguments: !If 78 | - RunInVpc 79 | - 80 | "--job-bookmark-option": job-bookmark-disable 81 | "--enable-glue-datacatalog": "true" 82 | "--enable-continuous-cloudwatch-log": "true" 83 | "--enable-continuous-log-filter": "true" 84 | "--enable-metrics": "true" 85 | "--disable-proxy-v2": "true" 86 | - 87 | "--job-bookmark-option": job-bookmark-disable 88 | "--enable-glue-datacatalog": "true" 89 | "--enable-continuous-cloudwatch-log": "true" 90 | "--enable-continuous-log-filter": "true" 91 | "--enable-metrics": "true" 92 | ExecutionProperty: 93 | MaxConcurrentRuns: 10 94 | MaxRetries: 0 95 | 
MaxCapacity: 2.0 96 | GlueVersion: "4.0" 97 | Name: !Sub 98 | - sdlf-${pTeamName}-${BaseGlueJobName} 99 | - BaseGlueJobName: !Select [0, !Split ["-", !Ref GlueJobName]] 100 | SecurityConfiguration: !Sub "{{resolve:ssm:/SDLF/Glue/${pTeamName}/SecurityConfigurationId}}" 101 | Role: !Ref rGlueRole 102 | Connections: !If 103 | - RunInVpc 104 | - Connections: 105 | - BUILDSTEPVARIABLE_GLUECONNECTIONS 106 | - !Ref "AWS::NoValue" 107 | -------------------------------------------------------------------------------- /sdlf-cicd/template-lambda-layer.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Transform: AWS::LanguageExtensions 3 | Description: Deploy Lambda Layers 4 | 5 | Parameters: 6 | pArtifactsBucket: 7 | Description: The artifacts bucket used by CodeBuild and CodePipeline 8 | Type: String 9 | pDomain: 10 | Description: Name of the data domain (all lowercase, no symbols or spaces) 11 | Type: String 12 | pEnvironment: 13 | Description: Environment name 14 | Type: String 15 | AllowedValues: [dev, test, prod] 16 | pTeamName: 17 | Description: Name of the team (all lowercase, no symbols or spaces) 18 | Type: String 19 | pLayers: 20 | Description: List of folder names from layers/ directory 21 | Type: CommaDelimitedList 22 | AllowedPattern: "^[a-zA-Z0-9]*$" 23 | pGitRef: 24 | Description: Git reference (commit id) with the sources of these layers 25 | Type: String 26 | 27 | Conditions: 28 | DatalakeLibraryLayer: !Equals [!Ref pTeamName, sdlf] 29 | 30 | Resources: 31 | ######## LAMBDA LAYERS ######## 32 | "Fn::ForEach::LambdaLayerResources": 33 | - LayerName 34 | - !Ref pLayers 35 | - "r${LayerName}LambdaLayer": 36 | Type: AWS::Lambda::LayerVersion 37 | Properties: 38 | CompatibleRuntimes: 39 | - python3.12 40 | Content: 41 | S3Bucket: !Ref pArtifactsBucket 42 | S3Key: !Sub layers/${pDomain}/${pEnvironment}/${pTeamName}/${LayerName}-${pGitRef}.zip 43 | Description: !Sub ${pTeamName} ${LayerName} Lambda Layer 44 | LayerName: 45 | !If [ 46 | DatalakeLibraryLayer, 47 | !Sub "sdlf-${LayerName}", 48 | !Sub "sdlf-${pTeamName}-${LayerName}" 49 | ] 50 | "r${LayerName}LambdaLayerSsm": 51 | Type: AWS::SSM::Parameter 52 | Properties: 53 | Name: 54 | !If [ 55 | DatalakeLibraryLayer, 56 | !Sub "/SDLF/Lambda/Latest${LayerName}Layer", 57 | !Sub "/SDLF/Lambda/${pTeamName}/Latest${LayerName}Layer" 58 | ] 59 | Type: String 60 | Value: !Ref 61 | "Fn::Sub": r${LayerName}LambdaLayer 62 | Description: !Sub The ARN of the latest version of the ${pTeamName} ${LayerName} layer 63 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/README.md: -------------------------------------------------------------------------------- 1 | # Datalake Library 2 | The data lake library repository is where a team pushes the transformation code (i.e. business logic) that they wish to apply to their datasets. 
After each new commit, the repository is automatically packaged into a Lambda layer and attached to the individual Lambda functions of the pipelines belonging to the team. The repository also holds helper functions that automate boilerplate code such as SQS, S3, and DynamoDB operations. 3 | 4 | ## IMPORTANT 5 | Please ensure that you follow this file structure, with a folder named `python` at the root containing all the Lambda code that should be part of the layer. The automated build process depends on the file structure being as follows: 6 | 7 | ./ 8 | ├── README.md 9 | └── python 10 | └── datalake_library 11 | ├── configuration 12 | ├── interfaces 13 | ├── octagon 14 | ├── tests 15 | └── transforms 16 | ├── stage_a_transforms 17 | │ ├── light_transform_blueprint.py 18 | │ ├── ... 19 | ├── stage_b_transforms 20 | │ ├── heavy_transform_blueprint.py 21 | │ └── ... 22 | └── transform_handler.py 23 | 24 | ## Adding Transformations 25 | When adding custom transformations to the Lambda layer, simply add your code to this repository (see the example `light_transform_blueprint.py` in the file structure above) in the relevant location (e.g. `stage_a_transforms` for light transformations in StageA). Any changes to this repository should stay in branches while in development; once tested and stable, they can be merged into the relevant environment branch (`dev`, `test` or `main`). The pipeline triggers on commits to that branch and releases the changes automatically. 26 | 27 | ## Pipeline 28 | The CICD pipeline for this repository is defined in the `sdlf-team` repository for each team (`nested-stacks/template-cicd.yaml`). A CodeBuild job packages the code in this repository into a `.zip` file (excluding `__pycache__` files), which is then published as a Lambda layer. Due to Lambda layer size limits, the code in this repository must not exceed 50 MB zipped and 250 MB unzipped. 29 | 30 | Configuration details, e.g. the name of the Lambda layer built from this repository, are defined in the template containing the **sdlf-pipeline** infrastructure. Configuration details available for customization include: 31 | 1. Through the pipeline: 32 | 1. Main Git branch to use — currently set to `dev` 33 | 2. Through the CodeBuild job: 34 | 1. Name of the resulting layer 35 | 2. Compatible runtimes 36 | 3.
SSM parameter used to store the ARN of the latest version 37 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/buildspec.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CFN_ENDPOINT="https://cloudformation.$AWS_REGION.amazonaws.com" 4 | 5 | pip uninstall -y aws-sam-cli && unzip -q aws-sam-cli-linux-x86_64.zip -d sam-installation 6 | ./sam-installation/install && sam --version 7 | pip install "cfn-lint<1" cloudformation-cli 8 | 9 | # removing everything up to the first hyphen, then anything that isn't a letter/number, and lower-casing what's left 10 | module_name_without_prefix="${SDLF_CONSTRUCT#*-}" 11 | module_name_alnum="${module_name_without_prefix//[^[:alnum:]]/}" 12 | MODULE="${module_name_alnum,,}" 13 | MODULE="DatalakeLibrary" # TODO 14 | 15 | mkdir artifacts 16 | zip -r artifacts/datalake_library.zip ./python -x \*__pycache__\* 17 | LAYER_HASH="$(sha256sum artifacts/datalake_library.zip | cut -c1-12)" 18 | aws s3api put-object --bucket "$ARTIFACTS_BUCKET" \ 19 | --key "sdlf/layers/$MODULE-$LAYER_HASH.zip" \ 20 | --body artifacts/datalake_library.zip 21 | 22 | STACK_NAME="sdlf-lambdalayers-$MODULE" 23 | aws cloudformation --endpoint-url "$CFN_ENDPOINT" deploy \ 24 | --stack-name "$STACK_NAME" \ 25 | --template-file ./template-lambda-layer.yaml \ 26 | --parameter-overrides \ 27 | pArtifactsBucket="$ARTIFACTS_BUCKET" \ 28 | pLayerName="$MODULE" \ 29 | pGitRef="$LAYER_HASH" \ 30 | --tags Framework=sdlf \ 31 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" || exit 1 32 | 33 | echo "done" 34 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-datalakeLibrary/python/__init__.py -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/commons.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional 3 | 4 | from boto3.dynamodb.types import TypeDeserializer, TypeSerializer 5 | 6 | if TYPE_CHECKING: 7 | from mypy_boto3_dynamodb.type_defs import ( 8 | AttributeValueTypeDef, 9 | ) 10 | 11 | 12 | def init_logger(file_name, log_level=None): 13 | if not log_level: 14 | log_level = "INFO" 15 | logging.basicConfig() 16 | logger = logging.getLogger(file_name) 17 | logger.setLevel(getattr(logging, log_level)) 18 | return logger 19 | 20 | 21 | def serialize_dynamodb_item( 22 | item: Mapping[str, Any], serializer: Optional[TypeSerializer] = None 23 | ) -> Dict[str, "AttributeValueTypeDef"]: 24 | serializer = serializer if serializer else TypeSerializer() 25 | return {k: serializer.serialize(v) for k, v in item.items()} 26 | 27 | 28 | def deserialize_dynamodb_item( 29 | item: Mapping[str, "AttributeValueTypeDef"], deserializer: Optional[TypeDeserializer] = None 30 | ) -> Dict[str, Any]: 31 | deserializer = deserializer if deserializer else TypeDeserializer() 32 | return {k: deserializer.deserialize(v) for k, v in item.items()} 33 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/datalake_exceptions.py: 
-------------------------------------------------------------------------------- 1 | class ObjectDeleteFailedException(Exception): 2 | """Raised when the lambda fails to delete a file(s)""" 3 | 4 | pass 5 | 6 | 7 | class InvalidS3PutEventException(Exception): 8 | """Raised when the object added to the bucket according to the provided event does not match the expected pattern""" 9 | 10 | pass 11 | 12 | 13 | class UnprocessedKeysException(RuntimeError): 14 | """Raised when keys are unprocessed, either because the batch limit is exceeded, the size of the response is too big 15 | (>16Mb) or the keys were throttled because of ProvisionedReads too low on ddb""" 16 | 17 | pass 18 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-datalakeLibrary/python/datalake_library/interfaces/__init__.py -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/sqs_interface.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import uuid 4 | 5 | import boto3 6 | from botocore.client import Config 7 | from botocore.exceptions import ClientError 8 | 9 | from ..commons import init_logger 10 | 11 | 12 | class SQSInterface: 13 | def __init__(self, queue_name, log_level=None, sqs_client=None): 14 | self.log_level = log_level or os.getenv("LOG_LEVEL", "INFO") 15 | self._logger = init_logger(__name__, self.log_level) 16 | sqs_endpoint_url = "https://sqs." + os.getenv("AWS_REGION") + ".amazonaws.com" 17 | session_config = Config(user_agent="awssdlf/2.10.0") 18 | self._sqs_client = sqs_client or boto3.client("sqs", endpoint_url=sqs_endpoint_url, config=session_config) 19 | 20 | self._message_queue = self._sqs_client.get_queue_url(QueueName=queue_name)["QueueUrl"] 21 | 22 | def receive_messages(self, max_num_messages=1): 23 | messages = self._sqs_client.receive_message( 24 | QueueUrl=self._message_queue, MaxNumberOfMessages=max_num_messages, WaitTimeSeconds=1 25 | )["Messages"] 26 | for message in messages: 27 | self._sqs_client.delete_message(QueueUrl=self._message_queue, ReceiptHandle=message["ReceiptHandle"]) 28 | return messages 29 | 30 | def receive_min_max_messages(self, min_items_process, max_items_process): 31 | """Gets max_items_process messages from an SQS queue. 32 | :param min_items_process: Minimum number of items to process. 33 | :param max_items_process: Maximum number of items to process. 
34 | :return messages obtained 35 | """ 36 | messages = [] 37 | num_messages_queue = int( 38 | self._sqs_client.get_queue_attributes( 39 | QueueUrl=self._message_queue, AttributeNames=["ApproximateNumberOfMessages"] 40 | )["Attributes"]["ApproximateNumberOfMessages"] 41 | ) 42 | 43 | # If not enough items to process, break with no messages 44 | if (num_messages_queue == 0) or (min_items_process > num_messages_queue): 45 | self._logger.info("Not enough messages - exiting") 46 | return messages 47 | 48 | # Only pull batch sizes of max_batch_size 49 | num_messages_queue = min(num_messages_queue, max_items_process) 50 | max_batch_size = 10 51 | batch_sizes = [max_batch_size] * math.floor(num_messages_queue / max_batch_size) 52 | if num_messages_queue % max_batch_size > 0: 53 | batch_sizes += [num_messages_queue % max_batch_size] 54 | 55 | for batch_size in batch_sizes: 56 | resp_msg = self.receive_messages(max_num_messages=batch_size) 57 | try: 58 | messages.extend(message["Body"] for message in resp_msg) 59 | except KeyError: 60 | break 61 | return messages 62 | 63 | def send_message_to_fifo_queue(self, message, group_id): 64 | try: 65 | self._sqs_client.send_message( 66 | QueueUrl=self._message_queue, 67 | MessageBody=message, 68 | MessageGroupId=group_id, 69 | MessageDeduplicationId=str(uuid.uuid1()), 70 | ) 71 | except ClientError as e: 72 | self._logger.error("Received error: %s", e, exc_info=True) 73 | raise e 74 | 75 | def send_batch_messages_to_fifo_queue(self, messages, batch_size, group_id): 76 | try: 77 | chunks = [messages[x : x + batch_size] for x in range(0, len(messages), batch_size)] 78 | for chunk in chunks: 79 | entries = [] 80 | for x in chunk: 81 | entry = { 82 | "Id": str(uuid.uuid1()), 83 | "MessageBody": str(x), 84 | "MessageGroupId": group_id, 85 | "MessageDeduplicationId": str(uuid.uuid1()), 86 | } 87 | entries.append(entry) 88 | self._sqs_client.send_message_batch(QueueUrl=self._message_queue, Entries=entries) 89 | except ClientError as e: 90 | self._logger.error("Received error: %s", e, exc_info=True) 91 | raise e 92 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/states_interface.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from datetime import date, datetime 4 | 5 | import boto3 6 | from botocore.client import Config 7 | 8 | from ..commons import init_logger 9 | 10 | 11 | class StatesInterface: 12 | def __init__(self, log_level=None, states_client=None): 13 | self.log_level = log_level or os.getenv("LOG_LEVEL", "INFO") 14 | self._logger = init_logger(__name__, self.log_level) 15 | stepfunctions_endpoint_url = "https://states." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 16 | session_config = Config(user_agent="awssdlf/2.10.0") 17 | self._states_client = states_client or boto3.client( 18 | "stepfunctions", endpoint_url=stepfunctions_endpoint_url, config=session_config 19 | ) 20 | 21 | @staticmethod 22 | def json_serial(obj): 23 | """JSON serializer for objects not serializable by default""" 24 | if isinstance(obj, (datetime, date)): 25 | return obj.isoformat() 26 | raise TypeError("Type %s not serializable" % type(obj)) 27 | 28 | def get_all_step_functions(self): 29 | self._logger.info("obtaining a list of all step functions") 30 | pages = self._states_client.get_paginator("list_state_machines").paginate() 31 | step_functions = [] 32 | for result in pages: 33 | step_functions.extend(result["stateMachines"]) 34 | return step_functions 35 | 36 | def run_state_machine(self, machine_arn, message): 37 | self._logger.info("running state machine with arn {}".format(machine_arn)) 38 | return self._states_client.start_execution( 39 | stateMachineArn=machine_arn, input=json.dumps(message, default=self.json_serial) 40 | ) 41 | 42 | def describe_state_execution(self, execution_arn): 43 | self._logger.info("describing {}".format(execution_arn)) 44 | response = self._states_client.describe_execution(executionArn=execution_arn) 45 | return response["status"] 46 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.35.25 2 | botocore==1.35.25 3 | boto3-stubs-lite[dynamodb]==1.35.25 4 | pytest==8.3.3 5 | pytest-mock==3.14.0 6 | python-dateutil==2.9.0 7 | pytest-cov==5.0.0 8 | mock==5.1.0 9 | coverage==7.6.1 10 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/sdlf/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | import logging 3 | 4 | from .__version__ import __title__, __version__ 5 | from .config import ( # noqa: F401; 6 | DynamoConfiguration, 7 | KMSConfiguration, 8 | S3Configuration, 9 | SQSConfiguration, 10 | StateMachineConfiguration, 11 | ) 12 | from .peh import PipelineExecutionHistoryAPI 13 | 14 | name = "sdlf" 15 | 16 | # Suppress boto3 logging 17 | logging.getLogger("boto3").setLevel(logging.CRITICAL) 18 | logging.getLogger("botocore").setLevel(logging.CRITICAL) 19 | logging.getLogger("s3transfer").setLevel(logging.CRITICAL) 20 | logging.getLogger("urllib3").setLevel(logging.CRITICAL) 21 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/sdlf/__version__.py: -------------------------------------------------------------------------------- 1 | __title__ = "SDLF" 2 | __version__ = "2.10.0" 3 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/sdlf/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import uuid 4 | 5 | 6 | def get_duration_sec(start_timestamp_str, end_timestamp_str): 7 | ts_format = "%Y-%m-%dT%H:%M:%S.%f%z" 8 | start_ts = datetime.datetime.strptime(start_timestamp_str, ts_format) 9 | end_ts = datetime.datetime.strptime(end_timestamp_str, ts_format) 10 | return (end_ts - start_ts).total_seconds() 11 | 12 | 13 | # 
datetime.datetime.now(datetime.UTC) 14 | def get_timestamp_iso(current_time=datetime.datetime.now(datetime.UTC)): 15 | return current_time.isoformat() 16 | 17 | 18 | # Return local date ISO formatted 19 | def get_local_date(local_time=datetime.datetime.now()): 20 | return local_time.strftime("%Y-%m-%d") 21 | 22 | 23 | def is_not_empty(arg): 24 | return (arg is not None) and (len(arg) != 0) 25 | 26 | 27 | def throw_if_none(arg, msg): 28 | if arg is None: 29 | raise ValueError(msg) 30 | 31 | 32 | def throw_none_or_empty(arg, msg): 33 | if (arg is None) or (len(arg) == 0): 34 | raise ValueError(msg) 35 | 36 | 37 | def validate_date(date_text): 38 | try: 39 | datetime.datetime.strptime(date_text, "%Y-%m-%d") 40 | except ValueError: 41 | raise ValueError("Incorrect date format, should be YYYY-MM-DD") 42 | 43 | 44 | def throw_if_false(condition, message): 45 | if not condition: 46 | raise ValueError(message) 47 | 48 | 49 | # Parses metrics string into a list of metric executions 50 | # E.g "Metric1#Metric2#Metric3" => ["Metric1", "Metric1#Metric2", "Metric1#Metric2#Metric3"] 51 | def parse_metrics(metrics_name): 52 | sep = "#" 53 | metric = [] 54 | arr = metrics_name.split(sep) 55 | 56 | if len(arr) != len(set(arr)): 57 | raise ValueError("Duplicated metrics are not allowed!") 58 | 59 | if sep in metrics_name: 60 | arr = metrics_name.split(sep) 61 | m = [] 62 | for item in arr: 63 | m.append(item) 64 | metric.append(sep.join(m)) 65 | else: 66 | metric.append(metrics_name) 67 | return metric 68 | 69 | 70 | def get_ttl(ttl_days, start_date=datetime.datetime.today()): 71 | """Get ttl value epoch format to insert into DDB TTL field 72 | 73 | Arguments: 74 | ttl_days {int} -- Number of days to keep the record 75 | 76 | Keyword Arguments: 77 | start_date {datetime} -- Starting timestamp (default: {datetime.datetime.today()}) 78 | 79 | Returns: 80 | int -- Value to insert into DynamoDB TTL field 81 | """ 82 | ttl_date = start_date + datetime.timedelta(days=ttl_days) 83 | expiry_ttl = int(time.mktime(ttl_date.timetuple())) 84 | return expiry_ttl 85 | 86 | 87 | def is_valid_uuid(val): 88 | try: 89 | uuid.UUID(str(val)) 90 | return True 91 | except ValueError: 92 | return False 93 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/template-lambda-layer.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy Lambda Layer 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | Type: String 8 | pLayerName: 9 | Description: Name of the lambda layer 10 | Type: String 11 | AllowedPattern: "^[a-zA-Z0-9]*$" 12 | pGitRef: 13 | Description: Git reference (commit id) with the sources of these layers 14 | Type: String 15 | 16 | Resources: 17 | rDatalakeLibraryLambdaLayer: 18 | Type: AWS::Lambda::LayerVersion 19 | Properties: 20 | CompatibleRuntimes: 21 | - python3.12 22 | Content: 23 | S3Bucket: !Ref pArtifactsBucket 24 | S3Key: !Sub sdlf/layers/${pLayerName}-${pGitRef}.zip 25 | Description: !Sub ${pLayerName} Lambda Layer 26 | LayerName: !Sub "sdlf-${pLayerName}" 27 | 28 | rDatalakeLibraryLambdaLayerSsm: 29 | Type: AWS::SSM::Parameter 30 | Properties: 31 | Name: !Sub "/SDLF/Lambda/Latest${pLayerName}Layer" 32 | Type: String 33 | Value: !Ref rDatalakeLibraryLambdaLayer 34 | Description: !Sub The ARN of the latest version of the ${pLayerName} layer 35 | 
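The layer published by the template above is advertised through an SSM parameter named `/SDLF/Lambda/Latest${pLayerName}Layer`. Below is a minimal sketch of how a consumer could resolve the resulting layer version ARN with boto3; it assumes the `DatalakeLibrary` layer name hard-coded in `buildspec.sh` and an `AWS_REGION` variable in the environment, both of which are illustrative assumptions rather than part of the template itself.

```
import os

import boto3


# Sketch: resolve the latest datalake_library Lambda layer ARN published by
# template-lambda-layer.yaml. The parameter name follows the
# /SDLF/Lambda/Latest<LayerName>Layer convention defined above; "DatalakeLibrary"
# is the layer name set in buildspec.sh (assumed here for illustration).
def get_latest_layer_arn(layer_name="DatalakeLibrary"):
    ssm_endpoint_url = "https://ssm." + os.getenv("AWS_REGION") + ".amazonaws.com"
    ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url)
    return ssm.get_parameter(Name=f"/SDLF/Lambda/Latest{layer_name}Layer")["Parameter"]["Value"]


if __name__ == "__main__":
    # Prints an ARN of the form arn:aws:lambda:<region>:<account>:layer:sdlf-DatalakeLibrary:<version>
    print(get_latest_layer_arn())
```

CloudFormation templates can consume the same parameter with a dynamic reference (e.g. `{{resolve:ssm:/SDLF/Lambda/LatestDatalakeLibraryLayer}}`), the same pattern the other templates in this repository use for similar SSM parameters.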
-------------------------------------------------------------------------------- /sdlf-dataset/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-dataset/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-dataset 2 | 3 | !!! note 4 | `sdlf-dataset` is defined in the [sdlf-dataset](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-dataset) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Dataset](../_static/sdlf-dataset.png) 9 | 10 | A SDLF dataset is a logical construct referring to a grouping of data. It can be anything from a single table to an entire database with multiple tables for example. However, an overall good practice is to limit the infrastructure deployed to the minimum to avoid unnecessary overhead and cost. It means that in general, the more data is grouped together the better. Abstraction at the transformation code level can then help make distinctions within a given dataset. 11 | 12 | Examples of datasets are: 13 | 14 | - A relational database with multiple tables (e.g. Sales DB with orders and customers tables) 15 | - A group of files from a data source (e.g. XML files from a Telemetry system) 16 | - A streaming data source (e.g. Kinesis data stream batching files and dumping them into S3) 17 | 18 | `sdlf-dataset` creates a Glue database, as well as a Glue crawler. 19 | 20 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 21 | 22 | ## Usage 23 | 24 | ### CloudFormation with [sdlf-cicd](cicd.md) 25 | 26 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 27 | 28 | ``` 29 | rExample: 30 | Type: awslabs::sdlf::dataset::MODULE 31 | Properties: 32 | pPipelineReference: !Ref pPipelineReference 33 | pTeamName: iot 34 | pDatasetName: legislators 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
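For instance, a downstream job could look up the Glue crawler that `sdlf-dataset` provisions for a given team and dataset and start it. A minimal sketch, reusing the `iot` team and `legislators` dataset from the usage example above, and assuming the parameter stores the crawler name and that region/credentials are already configured:

```
import boto3

ssm = boto3.client("ssm")
glue = boto3.client("glue")

# Parameter names follow the /SDLF/Glue/{team}/{dataset}/GlueCrawler convention listed below;
# the value is assumed to be the crawler name.
crawler_name = ssm.get_parameter(Name="/SDLF/Glue/iot/legislators/GlueCrawler")["Parameter"]["Value"]
glue.start_crawler(Name=crawler_name)
```
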
`sdlf-dataset` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ----------------------------------------- | -------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Datasets/{team}/{dataset}` | Dataset-specific metadata for data pipelines | | 44 | | `/SDLF/Glue/{team}/{dataset}/GlueCrawler` | Team dataset Glue crawler | | 45 | | `/SDLF/Glue/{team}/{dataset}/DataCatalog` | Team dataset metadata catalog" | | 46 | -------------------------------------------------------------------------------- /sdlf-dataset/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.dataset" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | aws-cdk-aws-glue-alpha = "^2.159.1a0" 22 | aws-cdk-aws-scheduler-alpha = "^2.159.1a0" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | -------------------------------------------------------------------------------- /sdlf-dataset/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-dataset/src/__init__.py -------------------------------------------------------------------------------- /sdlf-foundations/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-foundations/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.foundations" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /sdlf-foundations/src/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-foundations/src/__init__.py -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/catalog-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | dlq_name = os.environ["DLQ"] 9 | queue_name = os.environ["QUEUE"] 10 | sqs_endpoint_url = "https://sqs." + os.getenv("AWS_REGION") + ".amazonaws.com" 11 | sqs = boto3.client("sqs", endpoint_url=sqs_endpoint_url) 12 | 13 | 14 | def lambda_handler(event, context): 15 | try: 16 | dlq_queue_url = sqs.get_queue_url(QueueName=dlq_name)["QueueUrl"] 17 | queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"] 18 | 19 | messages = sqs.receive_message(QueueUrl=dlq_queue_url, MaxNumberOfMessages=1, WaitTimeSeconds=1).get("Messages", []) 20 | if not messages: 21 | logger.info("No messages found in {}".format(dlq_name)) 22 | return 23 | 24 | logger.info("Received {} messages".format(len(messages))) 25 | for message in messages: 26 | sqs.send_message(QueueUrl=queue_url, MessageBody=message["Body"]) 27 | sqs.delete_message(QueueUrl=dlq_queue_url, ReceiptHandle=message["ReceiptHandle"])  # delete from the DLQ the message was received from 28 | logger.info("Delete message succeeded") 29 | except Exception as e: 30 | logger.error("Fatal error", exc_info=True) 31 | raise e 32 | return 33 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/catalog/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from datetime import UTC, datetime 5 | from urllib.parse import unquote_plus 6 | 7 | import boto3 8 | from botocore.config import Config 9 | from botocore.exceptions import ClientError 10 | 11 | session_config = Config(user_agent_extra="awssdlf/2.10.0") 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | dynamodb = boto3.client("dynamodb", config=session_config) 16 | catalog_table = os.getenv("OBJECTMETADATA_TABLE") 17 | 18 | 19 | def parse_s3_event(s3_event): 20 | return { 21 | "bucket": {"S": s3_event["detail"]["bucket"]["name"]}, 22 | "key": {"S": unquote_plus(s3_event["detail"]["object"]["key"])}, 23 | "size": {"N": str(s3_event["detail"]["object"]["size"])}, 24 | "last_modified_date": {"S": s3_event["time"]}, 25 | "timestamp": {"N": str(int(round(datetime.now(UTC).timestamp() * 1000, 0)))}, 26 | } 27 | 28 | 29 | def put_item(table, item, key): 30 | try: 31 | response = dynamodb.put_item( 32 | TableName=table, 33 | Item=item, 34 | ConditionExpression=f"attribute_not_exists({key})", 35 | ) 36 | except ClientError as e: 37 | if e.response["Error"]["Code"] == "ConditionalCheckFailedException": 38 | logger.info(e.response["Error"]["Message"]) 39 | else: 40 | raise 41 | else: 42 | return response 43 | 44 | 45 | def delete_item(table, key): 46 | try: 47 | response = dynamodb.delete_item(TableName=table, Key=key) 48 | except ClientError as e: 49 | logger.error("Fatal error", exc_info=True) 50 | raise e 51 | else: 52 | return response 53 | 54 | 55 | def lambda_handler(event, context): 56 | try: 57 | logger.info(f"Received {len(event['Records'])} messages") 58 | for record in
event["Records"]: 59 | logger.info("Parsing S3 Event") 60 | message = json.loads(record["body"]) 61 | operation = message["detail-type"] 62 | bucket = message["detail"]["bucket"]["name"] 63 | key = unquote_plus(message["detail"]["object"]["key"]) 64 | id = f"s3://{bucket}/{key}" 65 | 66 | logger.info(f"Performing Dynamo {operation} operation") 67 | if operation in ["Object Deleted"]: 68 | delete_item(catalog_table, {"id": id}) 69 | else: 70 | item = parse_s3_event(message) 71 | item["id"] = {"S": id} 72 | item["stage"] = {"S": bucket.split("-")[-1]} 73 | put_item(catalog_table, item, "id") 74 | except Exception as e: 75 | logger.error("Fatal error", exc_info=True) 76 | raise e 77 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/replicate/src/event-create-delete-table.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0", 3 | "id": "0000000-0000-5328-220a-21c060f6c3f4", 4 | "detail-type": "Glue Data Catalog Database State Change", 5 | "source": "aws.glue", 6 | "account": "123456789012", 7 | "time": "2019-01-16T18:08:48Z", 8 | "region": "us-east-1", 9 | "resources": [ 10 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/history", 11 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/organizations" 12 | ], 13 | "detail": { 14 | "databaseName": "forecourt_datalake_dev_engineering_legislators_db", 15 | "typeOfChange": "CreateTable", 16 | "changedTables": [ 17 | "history", 18 | "organizations" 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/replicate/src/event-update-table.json: -------------------------------------------------------------------------------- 1 | { 2 | "version":"0", 3 | "id":"1a2ac50f-11dc-111c-09f3-102e0932d2bf", 4 | "detail-type":"Glue Data Catalog Table State Change", 5 | "source":"aws.glue", 6 | "account":"123456789012", 7 | "time":"2020-07-08T12:20:19Z", 8 | "region":"us-east-1", 9 | "resources":[ 10 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/persons" 11 | ], 12 | "detail":{ 13 | "databaseName":"forecourt_datalake_dev_engineering_legislators_db", 14 | "typeOfChange":"UpdateTable", 15 | "tableName":"persons", 16 | "changedPartitions":[ 17 | ] 18 | } 19 | } -------------------------------------------------------------------------------- /sdlf-monitoring/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-monitoring/kibana/generic_dashboard.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_id": "e4069440-fb2c-11e9-86fb-19c7ff919e3c", 4 | "_type": "dashboard", 5 | "_source": { 6 | "title": "Generic Dashboard", 7 | "hits": 0, 8 | "description": "", 9 | "panelsJSON": 
"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":24,\"h\":15,\"i\":\"1\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"dec27710-fb2c-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":24,\"y\":0,\"w\":24,\"h\":15,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"Count\":\"#E24D42\"},\"legendOpen\":false}},\"id\":\"26819f40-fb2d-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":42,\"w\":48,\"h\":14,\"i\":\"5\"},\"embeddableConfig\":{\"vis\":{\"params\":{\"sort\":{\"columnIndex\":2,\"direction\":\"asc\"}}}},\"id\":\"60903910-fb2f-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":0,\"y\":15,\"w\":13,\"h\":12,\"i\":\"8\"},\"embeddableConfig\":{},\"id\":\"3c879b80-fbc5-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"9\",\"gridData\":{\"x\":37,\"y\":15,\"w\":11,\"h\":12,\"i\":\"9\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"/aws/lambda/f1-dl04-dev-activemq-process-a\":\"#EA6460\"},\"legendOpen\":false}},\"id\":\"945eca40-fbc5-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"12\",\"gridData\":{\"x\":13,\"y\":15,\"w\":24,\"h\":12,\"i\":\"12\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"Count\":\"#5195CE\"},\"legendOpen\":false}},\"id\":\"131befe0-fbd3-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"13\",\"gridData\":{\"x\":0,\"y\":27,\"w\":24,\"h\":15,\"i\":\"13\"},\"embeddableConfig\":{},\"id\":\"20f6a2e0-fbce-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"15\",\"gridData\":{\"x\":24,\"y\":27,\"w\":24,\"h\":15,\"i\":\"15\"},\"version\":\"6.3.1\",\"type\":\"visualization\",\"id\":\"6506b6d0-1048-11ea-a53e-f38a7f594614\",\"embeddableConfig\":{}}]", 10 | "optionsJSON": "{\"darkTheme\":false,\"hidePanelTitles\":false,\"useMargins\":true}", 11 | "version": 1, 12 | "timeRestore": false, 13 | "kibanaSavedObjectMeta": { 14 | "searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}" 15 | } 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /sdlf-pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.pipeline" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | 12 | packages = [ 13 | { include = 
"**/*", from = "src", to = "sdlf" }, 14 | ] 15 | 16 | exclude = ["**/*.yaml"] 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.12" 20 | aws-cdk-lib = "^2.159.1" 21 | constructs = ">=10.0.0,<11.0.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-pipeline/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-pipeline/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-dataquality/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/initial-check/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | 4 | import boto3 5 | from datalake_library.commons import init_logger 6 | from datalake_library.configuration.resource_configs import DynamoConfiguration 7 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | dynamodb = boto3.client("dynamodb") 12 | ssm_endpoint_url = "https://ssm." + os.getenv("AWS_REGION") + ".amazonaws.com" 13 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 14 | glue_endpoint_url = "https://glue." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 15 | glue = boto3.client("glue", endpoint_url=glue_endpoint_url) 16 | 17 | 18 | def get_glue_transform_details(bucket, team, dataset, env, pipeline, stage): 19 | dynamo_config = DynamoConfiguration() 20 | dynamo_interface = DynamoInterface(dynamo_config) 21 | 22 | transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}") 23 | 24 | glue_database = ssm.get_parameter(Name=f"/SDLF/Glue/{team}/{dataset}/DataCatalog")["Parameter"]["Value"] 25 | glue_capacity = {"NumberOfWorkers": 5} 26 | wait_time = 45 27 | 28 | dataquality_tables = [] 29 | 30 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 31 | if pipeline in transform_info.get("pipeline", {}): 32 | if stage in transform_info["pipeline"][pipeline]: 33 | logger.info(f"Details from DynamoDB: {transform_info['pipeline'][pipeline][stage]}") 34 | glue_capacity = transform_info["pipeline"][pipeline][stage].get("glue_capacity", glue_capacity) 35 | wait_time = transform_info["pipeline"][pipeline][stage].get("wait_time", wait_time) 36 | dataquality_tables = transform_info["pipeline"][pipeline][stage].get( 37 | "dataquality_tables", dataquality_tables 38 | ) 39 | 40 | return { 41 | "DatabaseName": glue_database, 42 | "wait_time": wait_time, 43 | "dataquality_tables": dataquality_tables, 44 | **glue_capacity, 45 | } 46 | 47 | 48 | def lambda_handler(event, context): 49 | """Calls custom transform developed by user 50 | 51 | Arguments: 52 | event {dict} -- Dictionary with details on previous processing step 53 | context {dict} -- Dictionary with details on Lambda context 54 | 55 | Returns: 56 | {dict} -- Dictionary with Data Quality Job details 57 | """ 58 | try: 59 | logger.info("Fetching event data from previous step") 60 | bucket = event["body"]["bucket"] 61 | team = event["body"]["team"] 62 | pipeline = event["body"]["pipeline"] 63 | stage = event["body"]["pipeline_stage"] 64 | dataset = event["body"]["dataset"] 65 | env = event["body"]["env"] 66 | 67 | # Checking if Data Quality is enabled on tables 68 | logger.info("Querying data quality enabled tables") 69 | event["body"]["glue"] = get_glue_transform_details(bucket, team, dataset, env, pipeline, stage) 70 | event["body"]["glue"]["crawler_name"] = "-".join(["sdlf", team, dataset, "post-stage-crawler"]) 71 | logger.info(event["body"]["glue"]) 72 | 73 | map_input = [] 74 | for table in event["body"]["glue"]["dataquality_tables"]: 75 | map_item = copy.deepcopy(event) 76 | map_item["body"]["glue"]["TableName"] = table 77 | map_input.append(map_item) 78 | 79 | except Exception as e: 80 | logger.error("Fatal error", exc_info=True) 81 | raise e 82 | return {"dataquality": map_input} 83 | -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/stage-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration, StateMachineConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | from datalake_library.interfaces.states_interface import StatesInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | team = os.environ["TEAM"] 15 | pipeline = os.environ["PIPELINE"] 16 | stage = os.environ["STAGE"] 17 | state_config = StateMachineConfiguration(team, pipeline, stage) 18 | 
sqs_config = SQSConfiguration(team, pipeline, stage) 19 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 20 | 21 | messages = dlq_interface.receive_messages(1) 22 | if not messages: 23 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 24 | return 25 | 26 | logger.info("Received {} messages".format(len(messages))) 27 | for message in messages: 28 | logger.info("Starting State Machine Execution") 29 | if isinstance(message["Body"], str): 30 | response = json.loads(message["Body"]) 31 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 32 | logger.info("Redrive message succeeded") 33 | except Exception as e: 34 | logger.error("Fatal error", exc_info=True) 35 | raise e 36 | return 37 | -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/stage-routing/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import ( 6 | S3Configuration, 7 | SQSConfiguration, 8 | StateMachineConfiguration, 9 | ) 10 | from datalake_library.interfaces.sqs_interface import SQSInterface 11 | from datalake_library.interfaces.states_interface import StatesInterface 12 | 13 | logger = init_logger(__name__) 14 | 15 | 16 | def lambda_handler(event, context): 17 | """Checks if any items need processing and triggers state machine 18 | Arguments: 19 | event {dict} -- Dictionary with details on what needs processing 20 | context {dict} -- Dictionary with details on Lambda context 21 | """ 22 | 23 | try: 24 | records = event["Records"] 25 | logger.info(f"Received {len(records)} messages") 26 | response = {} 27 | for record in records: 28 | event_body = json.loads(json.loads(record["body"])["output"])[0]["body"] 29 | logger.info(event_body) 30 | team = event_body["team"] 31 | pipeline = event_body["pipeline"] 32 | stage = os.environ["PIPELINE_STAGE"] 33 | dataset = event_body["dataset"] 34 | org = event_body["org"] 35 | domain = event_body["domain"] 36 | env = event_body["env"] 37 | stage_bucket = S3Configuration().stage_bucket 38 | 39 | response = { 40 | "statusCode": 200, 41 | "body": { 42 | "bucket": stage_bucket, 43 | "team": team, 44 | "pipeline": pipeline, 45 | "pipeline_stage": stage, 46 | "dataset": dataset, 47 | "org": org, 48 | "domain": domain, 49 | "env": env, 50 | }, 51 | } 52 | if response: 53 | logger.info("Starting State Machine Execution") 54 | state_config = StateMachineConfiguration(team, pipeline, stage) 55 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 56 | except Exception as e: 57 | # If failure send to DLQ 58 | sqs_config = SQSConfiguration(team, pipeline, stage) 59 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 60 | dlq_interface.send_message_to_fifo_queue(json.dumps(response), "failed") 61 | logger.error("Fatal error", exc_info=True) 62 | raise e 63 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 
21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-ecsfargate/README.md -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-ecsfargate" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-ecsfargate/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | 8 | logger = init_logger(__name__) 9 | team = os.environ["TEAM"] 10 | dataset = os.environ["DATASET"] 11 | pipeline = os.environ["PIPELINE"] 12 | pipeline_stage = os.environ["PIPELINE_STAGE"] 13 | org = os.environ["ORG"] 14 | domain = os.environ["DOMAIN"] 15 | env = os.environ["ENV"] 16 | 17 | 18 | def lambda_handler(event, context): 19 | try: 20 | if isinstance(event, str): 21 | event = json.loads(event) 22 | 23 | sqs_config = SQSConfiguration(team, pipeline, pipeline_stage) 24 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 25 | 26 | logger.info("Execution Failed. 
Sending original payload to DLQ") 27 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 28 | except Exception as e: 29 | logger.error("Fatal error", exc_info=True) 30 | raise e 31 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.octagon import peh 6 | 7 | logger = init_logger(__name__) 8 | team = os.environ["TEAM"] 9 | dataset = os.environ["DATASET"] 10 | pipeline = os.environ["PIPELINE"] 11 | pipeline_stage = os.environ["PIPELINE_STAGE"] 12 | org = os.environ["ORG"] 13 | domain = os.environ["DOMAIN"] 14 | env = os.environ["ENV"] 15 | 16 | 17 | def lambda_handler(event, context): 18 | """Updates the S3 objects metadata catalog 19 | 20 | Arguments: 21 | event {dict} -- Dictionary with details on previous processing step 22 | context {dict} -- Dictionary with details on Lambda context 23 | 24 | Returns: 25 | {dict} -- Dictionary with outcome of the process 26 | """ 27 | try: 28 | logger.info("Initializing Octagon client") 29 | component = context.function_name.split("-")[-2].title() 30 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build() 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | for records in event: 36 | for record in records: 37 | if "processed" not in record or not record["processed"]: 38 | partial_failure = True 39 | 40 | if not partial_failure: 41 | octagon_client.update_pipeline_execution( 42 | status="{} {} Processing".format(pipeline_stage, component), component=component 43 | ) 44 | octagon_client.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | octagon_client.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | 
logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/state-machine/stage-ecsfargate.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Simple ECS Fargate-based transform", 3 | "StartAt": "Try", 4 | "States": { 5 | "Try": { 6 | "Type": "Parallel", 7 | "Branches": [ 8 | { 9 | "StartAt": "Pass", 10 | "States": { 11 | "Pass": { 12 | "Type": "Pass", 13 | "Next": "Records", 14 | "Parameters": { 15 | "Items.$": "States.StringToJson($)" 16 | } 17 | }, 18 | "Records": { 19 | "Type": "Map", 20 | "ItemProcessor": { 21 | "ProcessorConfig": { 22 | "Mode": "DISTRIBUTED", 23 | "ExecutionType": "STANDARD" 24 | }, 25 | "StartAt": "Execute ECS Fargate Transformation", 26 | "States": { 27 | "Execute ECS Fargate Transformation": { 28 | "Type": "Task", 29 | "Resource": "arn:aws:states:::ecs:runTask.sync", 30 | "Parameters": { 31 | "LaunchType": "FARGATE", 32 | "Cluster": "$.Items[0].transform.ecsfargate_cluster", 33 | "TaskDefinition": "$.Items[0].transform.transform" 34 | }, 35 | "End": true 36 | } 37 | } 38 | }, 39 | "Next": "Post-update Catalog", 40 | "Label": "Records", 41 | "MaxConcurrency": 50, 42 | "ToleratedFailurePercentage": 100, 43 | "ItemBatcher": { 44 | "MaxItemsPerBatch": 1 45 | }, 46 | "InputPath": "$.Items" 47 | }, 48 | "Post-update Catalog": { 49 | "Type": "Task", 50 | "Resource": "arn:aws:states:::lambda:invoke", 51 | "ResultPath": null, 52 | "Parameters": { 53 | "Payload.$": "$", 54 | "FunctionName": "${lPostMetadata}:$LATEST" 55 | }, 56 | "Retry": [ 57 | { 58 | "ErrorEquals": [ 59 | "Lambda.ServiceException", 60 | "Lambda.AWSLambdaException", 61 | "Lambda.SdkClientException", 62 | "Lambda.TooManyRequestsException" 63 | ], 64 | "IntervalSeconds": 2, 65 | "MaxAttempts": 6, 66 | "BackoffRate": 2 67 | } 68 | ], 69 | "End": true 70 | } 71 | } 72 | } 73 | ], 74 | "End": true, 75 | "Catch": [ 76 | { 77 | "ErrorEquals": [ 78 | "States.ALL" 79 | ], 80 | "ResultPath": null, 81 | "Next": "Error" 82 | } 83 | ] 84 | }, 85 | "Error": { 86 | "Type": "Task", 87 | "Resource": "arn:aws:states:::lambda:invoke", 88 | "OutputPath": "$.Payload", 89 | "Parameters": { 90 | "Payload.$": "$", 91 | "FunctionName": "${lError}:$LATEST" 92 | }, 93 | "Retry": [ 94 | { 95 | "ErrorEquals": [ 96 | "Lambda.ServiceException", 97 | "Lambda.AWSLambdaException", 98 | "Lambda.SdkClientException", 99 | "Lambda.TooManyRequestsException" 100 | ], 101 | "IntervalSeconds": 2, 102 | "MaxAttempts": 6, 103 | "BackoffRate": 2 104 | } 105 | ], 106 | "Next": "Fail" 107 | }, 108 | "Fail": { 109 | "Type": "Fail" 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-emrserverless/README.md -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-emrserverless" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-emrserverless/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | 8 | logger = init_logger(__name__) 9 | team = os.environ["TEAM"] 10 | dataset = os.environ["DATASET"] 11 | pipeline = os.environ["PIPELINE"] 12 | pipeline_stage = os.environ["PIPELINE_STAGE"] 13 | org = os.environ["ORG"] 14 | domain = os.environ["DOMAIN"] 15 | env = os.environ["ENV"] 16 | 17 | 18 | def lambda_handler(event, context): 19 | try: 20 | if isinstance(event, str): 21 | event = json.loads(event) 22 | 23 | sqs_config = SQSConfiguration(team, pipeline, pipeline_stage) 24 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 25 | 26 | logger.info("Execution Failed. 
Sending original payload to DLQ") 27 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 28 | except Exception as e: 29 | logger.error("Fatal error", exc_info=True) 30 | raise e 31 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.octagon import peh 6 | 7 | logger = init_logger(__name__) 8 | team = os.environ["TEAM"] 9 | dataset = os.environ["DATASET"] 10 | pipeline = os.environ["PIPELINE"] 11 | pipeline_stage = os.environ["PIPELINE_STAGE"] 12 | org = os.environ["ORG"] 13 | domain = os.environ["DOMAIN"] 14 | env = os.environ["ENV"] 15 | 16 | 17 | def lambda_handler(event, context): 18 | """Updates the S3 objects metadata catalog 19 | 20 | Arguments: 21 | event {dict} -- Dictionary with details on previous processing step 22 | context {dict} -- Dictionary with details on Lambda context 23 | 24 | Returns: 25 | {dict} -- Dictionary with outcome of the process 26 | """ 27 | try: 28 | logger.info("Initializing Octagon client") 29 | component = context.function_name.split("-")[-2].title() 30 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build() 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | for records in event: 36 | for record in records: 37 | if "processed" not in record or not record["processed"]: 38 | partial_failure = True 39 | 40 | if not partial_failure: 41 | octagon_client.update_pipeline_execution( 42 | status="{} {} Processing".format(pipeline_stage, component), component=component 43 | ) 44 | octagon_client.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | octagon_client.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | 
logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-stage-glue/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-stage-glue/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-glue (sdlf-stageB) 2 | 3 | !!! note 4 | `sdlf-stage-glue` is defined in the [sdlf-stageB](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageB) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Glue](../_static/sdlf-stage-glue.png) 9 | 10 | Run a Glue job. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainB: 20 | Type: awslabs::sdlf::stageB::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 24 | pStageName: B 25 | pPipeline: main 26 | pTeamName: iot 27 | pTriggerType: schedule 28 | pEventPattern: !Sub >- 29 | { 30 | "source": ["aws.states"], 31 | "detail-type": ["Step Functions Execution Status Change"], 32 | "detail": { 33 | "status": ["SUCCEEDED"], 34 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 35 | } 36 | } 37 | pSchedule: "cron(*/5 * * * ? *)" 38 | pEnableTracing: false 39 | ``` 40 | 41 | ## Interface 42 | 43 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-stage-glue` publishes the following parameters: 44 | 45 | | SSM Parameter                                         | Description                                                        | Comment                                        | 46 | | ---------------------------------------------------- | ------------------------------------------------------------------ | ---------------------------------------------- | 47 | | `/SDLF/SM/{team}/{pipeline}{stage}SM`                 | Step Functions state machine for this stage                        |                                                | 48 | -------------------------------------------------------------------------------- /sdlf-stage-glue/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-glue" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 |     { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-glue/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | from datalake_library.sdlf import SQSConfiguration 7 | 8 | logger = init_logger(__name__) 9 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 |     try: 14 |         if isinstance(event, str): 15 |             event = json.loads(event) 16 | 17 |         sqs_config = SQSConfiguration(instance=deployment_instance) 18 |         sqs_interface = SQSInterface(sqs_config.stage_dlq) 19 | 20 |         logger.info("Execution Failed. 
Sending original payload to DLQ") 21 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 22 | except Exception as e: 23 | logger.error("Fatal error", exc_info=True) 24 | raise e 25 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.sdlf import PipelineExecutionHistoryAPI 5 | 6 | logger = init_logger(__name__) 7 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 8 | peh_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 9 | manifests_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | """Updates the S3 objects metadata catalog 14 | 15 | Arguments: 16 | event {dict} -- Dictionary with details on previous processing step 17 | context {dict} -- Dictionary with details on Lambda context 18 | 19 | Returns: 20 | {dict} -- Dictionary with outcome of the process 21 | """ 22 | try: 23 | logger.info("Initializing Octagon client") 24 | component = context.function_name.split("-")[-2].title() 25 | pipeline_execution = PipelineExecutionHistoryAPI( 26 | run_in_context="LAMBDA", 27 | region=os.getenv("AWS_REGION"), 28 | peh_table_instance=peh_table_instance, 29 | manifests_table_instance=manifests_table_instance, 30 | ) 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | pipeline_execution.retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | # for records in event: 36 | # for record in records: 37 | # if "processed" not in record or not record["processed"]: 38 | # partial_failure = True 39 | 40 | if not partial_failure: 41 | pipeline_execution.update_pipeline_execution( 42 | status=f"{deployment_instance} {component} Processing", component=component 43 | ) 44 | pipeline_execution.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | pipeline_execution.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{deployment_instance} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | sqs_config = SQSConfiguration(instance=deployment_instance) 14 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 15 | messages = dlq_interface.receive_messages(1) 16 | if not messages: 17 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 18 | return 19 | 20 | logger.info("Received {} messages".format(len(messages))) 21 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 22 | for message in messages: 23 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 24 | logger.info("Redrive message 
succeeded") 25 | except Exception as e: 26 | logger.error("Fatal error", exc_info=True) 27 | raise e 28 | return 29 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-lambda (sdlf-stageA) 2 | 3 | !!! note 4 | `sdlf-stage-lambda` is defined in the [sdlf-stageA](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageA) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Lambda](../_static/sdlf-stage-lambda.png) 9 | 10 | Run a Lambda function. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainA: 20 | Type: awslabs::sdlf::stageA::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: A 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: >- 28 | { 29 | "source": ["aws.s3"], 30 | "detail-type": ["Object Created"], 31 | "detail": { 32 | "bucket": { 33 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 34 | }, 35 | "object": { 36 | "key": [{ "prefix": "iot/legislators/" }] 37 | } 38 | } 39 | } 40 | pEnableTracing: false 41 | ``` 42 | 43 | ## Interface 44 | 45 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-stage-lambda` publishes the following parameters: 46 | 47 | | SSM Parameter | Description | Comment | 48 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 49 | | `/SDLF/Lambda/{team}/{pipeline}{stage}RoutingLambda` | Routing Lambda | | 50 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions | | 51 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-lambda" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | #sdlf-pipeline = "^2.10.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-stage-lambda/src/__init__.py -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | from datalake_library.sdlf import SQSConfiguration 7 | 8 | logger = init_logger(__name__) 9 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | if isinstance(event, str): 15 | event = json.loads(event) 16 | 17 | sqs_config = SQSConfiguration(instance=deployment_instance) 18 | sqs_interface = SQSInterface(sqs_config.stage_dlq) 19 | 20 | logger.info("Execution Failed. 
Sending original payload to DLQ") 21 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 22 | except Exception as e: 23 | logger.error("Fatal error", exc_info=True) 24 | raise e 25 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.sdlf import PipelineExecutionHistoryAPI 5 | 6 | logger = init_logger(__name__) 7 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 8 | peh_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 9 | manifests_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | """Updates the S3 objects metadata catalog 14 | 15 | Arguments: 16 | event {dict} -- Dictionary with details on previous processing step 17 | context {dict} -- Dictionary with details on Lambda context 18 | 19 | Returns: 20 | {dict} -- Dictionary with outcome of the process 21 | """ 22 | try: 23 | logger.info("Initializing Octagon client") 24 | component = context.function_name.split("-")[-2].title() 25 | pipeline_execution = PipelineExecutionHistoryAPI( 26 | run_in_context="LAMBDA", 27 | region=os.getenv("AWS_REGION"), 28 | peh_table_instance=peh_table_instance, 29 | manifests_table_instance=manifests_table_instance, 30 | ) 31 | peh_id = event[0]["run_output"][0]["transform"]["peh_id"] 32 | pipeline_execution.retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | # for records in event: 36 | # for record in records: 37 | # if "processed" not in record or not record["processed"]: 38 | # partial_failure = True 39 | 40 | if not partial_failure: 41 | pipeline_execution.update_pipeline_execution( 42 | status=f"{deployment_instance} {component} Processing", component=component 43 | ) 44 | pipeline_execution.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | pipeline_execution.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{deployment_instance} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/process-object/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import PurePath 4 | 5 | from datalake_library.commons import init_logger 6 | from datalake_library.interfaces.s3_interface import S3Interface 7 | from datalake_library.sdlf import ( 8 | KMSConfiguration, 9 | S3Configuration, 10 | ) 11 | 12 | logger = init_logger(__name__) 13 | s3_prefix = os.environ["S3_PREFIX"] 14 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 15 | storage_deployment_instance = os.environ["STORAGE_DEPLOYMENT_INSTANCE"] 16 | 17 | 18 | def transform_object(bucket, key): 19 | s3_interface = S3Interface() 20 | # IMPORTANT: Stage bucket where transformed data must be uploaded 21 | stage_bucket = S3Configuration(instance=storage_deployment_instance).stage_bucket 22 | # Download S3 object locally to /tmp directory 23 | # The s3_helper.download_object method 24 | # returns the local path where the file was saved 25 | local_path = 
s3_interface.download_object(bucket, key) 26 | 27 | # Apply business business logic: 28 | # Below example is opening a JSON file and 29 | # extracting fields, then saving the file 30 | # locally and re-uploading to Stage bucket 31 | def parse(json_data): 32 | l = [] # noqa: E741 33 | for d in json_data: 34 | o = d.copy() 35 | for k in d: 36 | if type(d[k]) in [dict, list]: 37 | o.pop(k) 38 | l.append(o) 39 | 40 | return l 41 | 42 | # Reading file locally 43 | with open(local_path, "r") as raw_file: 44 | data = raw_file.read() 45 | 46 | json_data = json.loads(data) 47 | 48 | # Saving file locally to /tmp after parsing 49 | output_path = f"{PurePath(local_path).with_suffix('')}_parsed.json" 50 | with open(output_path, "w", encoding="utf-8") as write_file: 51 | json.dump(parse(json_data), write_file, ensure_ascii=False, indent=4) 52 | 53 | # Uploading file to Stage bucket at appropriate path 54 | # IMPORTANT: Build the output s3_path without the s3://stage-bucket/ 55 | s3_path = f"{s3_prefix}/{deployment_instance}/{PurePath(output_path).name}" 56 | # IMPORTANT: Notice "stage_bucket" not "bucket" 57 | kms_key = KMSConfiguration(instance=storage_deployment_instance).data_kms_key 58 | s3_interface.upload_object(output_path, stage_bucket, s3_path, kms_key=kms_key) 59 | 60 | return s3_path 61 | 62 | 63 | def lambda_handler(event, context): 64 | """Calls custom transform developed by user 65 | 66 | Arguments: 67 | event {dict} -- Dictionary with details on previous processing step 68 | context {dict} -- Dictionary with details on Lambda context 69 | 70 | Returns: 71 | {dict} -- Dictionary with Processed Bucket and Key(s) 72 | """ 73 | try: 74 | # this default Lambda expects records to be S3 events 75 | for record in event: 76 | logger.info(f"Processing file: {record['object']['key']} in {record['bucket']['name']}") 77 | try: 78 | transform_object(record["bucket"]["name"], record["object"]["key"]) 79 | record["processed"] = True 80 | except json.decoder.JSONDecodeError as e: 81 | record["processed"] = False 82 | record["error"] = repr(e) 83 | 84 | except Exception as e: 85 | logger.error("Fatal error", exc_info=True) 86 | raise e 87 | 88 | return event 89 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.interfaces.sqs_interface import SQSInterface 5 | from datalake_library.sdlf import SQSConfiguration 6 | 7 | logger = init_logger(__name__) 8 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | sqs_config = SQSConfiguration(instance=deployment_instance) 14 | dlq_interface = SQSInterface(sqs_config.stage_dlq) 15 | messages = dlq_interface.receive_messages(1) 16 | if not messages: 17 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 18 | return 19 | 20 | logger.info("Received {} messages".format(len(messages))) 21 | queue_interface = SQSInterface(sqs_config.stage_queue) 22 | for message in messages: 23 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 24 | logger.info("Redrive message succeeded") 25 | except Exception as e: 26 | logger.error("Fatal error", exc_info=True) 27 | raise e 28 | return 29 | -------------------------------------------------------------------------------- /sdlf-stageA/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | if isinstance(event, str): 13 | event = json.loads(event) 14 | sqs_config = SQSConfiguration(event["team"], event["pipeline"], event["pipeline_stage"]) 15 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 16 | 17 | logger.info("Execution Failed. Sending original payload to DLQ") 18 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 19 | except Exception as e: 20 | logger.error("Fatal error", exc_info=True) 21 | raise e 22 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | from datalake_library import octagon 2 | from datalake_library.commons import init_logger 3 | from datalake_library.configuration.resource_configs import DynamoConfiguration, S3Configuration 4 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 5 | from datalake_library.interfaces.s3_interface import S3Interface 6 | from datalake_library.octagon import peh 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def lambda_handler(event, context): 12 | """Updates the S3 objects metadata catalog 13 | 14 | Arguments: 15 | event {dict} -- Dictionary with details on previous processing step 16 | context {dict} -- Dictionary with details on Lambda context 17 | 18 | Returns: 19 | {dict} -- Dictionary with outcome of the process 20 | """ 21 | try: 22 | logger.info("Fetching event data from previous step") 23 | processed_keys = event["body"]["processedKeys"] 24 | team = event["body"]["team"] 25 | pipeline = event["body"]["pipeline"] 26 | stage = event["body"]["pipeline_stage"] 27 | dataset = event["body"]["dataset"] 28 | peh_id = event["body"]["peh_id"] 29 | 30 | logger.info("Initializing Octagon client") 31 | component = context.function_name.split("-")[-2].title() 32 | octagon_client = ( 33 | octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["body"]["env"]).build() 34 | ) 35 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 36 | 37 | logger.info("Initializing DynamoDB config and Interface") 38 | dynamo_config = DynamoConfiguration() 39 | dynamo_interface = DynamoInterface(dynamo_config) 40 | 41 | logger.info("Storing metadata to DynamoDB") 42 | bucket = S3Configuration().stage_bucket 43 | for key in processed_keys: 44 | size, last_modified_date = S3Interface().get_size_and_last_modified(bucket, key) 45 | object_metadata = { 46 | "bucket": bucket, 47 | "key": key, 48 | "size": size, 49 | "last_modified_date": 
last_modified_date, 50 | "org": event["body"]["org"], 51 | "app": event["body"]["domain"], 52 | "env": event["body"]["env"], 53 | "team": team, 54 | "pipeline": pipeline, 55 | "dataset": dataset, 56 | "stage": "stage", 57 | "pipeline_stage": stage, 58 | "peh_id": peh_id, 59 | } 60 | 61 | dynamo_interface.update_object_metadata_catalog(object_metadata) 62 | 63 | octagon_client.update_pipeline_execution( 64 | status="{} {} Processing".format(stage, component), component=component 65 | ) 66 | octagon_client.end_pipeline_execution_success() 67 | except Exception as e: 68 | logger.error("Fatal error", exc_info=True) 69 | octagon_client.end_pipeline_execution_failed( 70 | component=component, issue_comment="{} {} Error: {}".format(stage, component, repr(e)) 71 | ) 72 | raise e 73 | return 200 74 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-preupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import DynamoConfiguration 6 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def get_lambda_transform_details(team, dataset, pipeline, stage): 12 | dynamo_config = DynamoConfiguration() 13 | dynamo_interface = DynamoInterface(dynamo_config) 14 | transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}") 15 | lambda_arn = os.getenv("STAGE_TRANSFORM_LAMBDA") 16 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 17 | if pipeline in transform_info.get("pipeline", {}): 18 | if stage in transform_info["pipeline"][pipeline]: 19 | logger.info(f"Details from DynamoDB: {transform_info['pipeline'][pipeline][stage]}") 20 | lambda_arn = transform_info["pipeline"][pipeline][stage].get("lambda_arn", lambda_arn) 21 | ####################################################### 22 | # We assume a Lambda function has already been created based on 23 | # customer needs. 24 | ####################################################### 25 | 26 | return {"lambda_arn": lambda_arn} 27 | 28 | 29 | def lambda_handler(event, context): 30 | """Updates the objects metadata catalog 31 | 32 | Arguments: 33 | event {dict} -- Dictionary with details on S3 event 34 | context {dict} -- Dictionary with details on Lambda context 35 | 36 | Returns: 37 | {dict} -- Dictionary with Processed Bucket and Key 38 | """ 39 | try: 40 | logger.info("Fetching event data from previous step") 41 | team = event["team"] 42 | pipeline = event["pipeline"] 43 | stage = event["pipeline_stage"] 44 | dataset = event["dataset"] 45 | 46 | logger.info("Initializing Octagon client") 47 | component = context.function_name.split("-")[-2].title() 48 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["env"]).build() 49 | event["peh_id"] = octagon_client.start_pipeline_execution( 50 | pipeline_name="{}-{}-{}".format(team, pipeline, stage), 51 | dataset_name="{}-{}".format(team, dataset), 52 | comment=event, 53 | ) 54 | # Add business metadata (e.g. 
event['project'] = 'xyz') 55 | 56 | logger.info("Initializing DynamoDB config and Interface") 57 | dynamo_config = DynamoConfiguration() 58 | dynamo_interface = DynamoInterface(dynamo_config) 59 | 60 | logger.info("Storing metadata to DynamoDB") 61 | dynamo_interface.update_object_metadata_catalog(event) 62 | 63 | logger.info("Passing arguments to the next function of the state machine") 64 | octagon_client.update_pipeline_execution( 65 | status="{} {} Processing".format(stage, component), component=component 66 | ) 67 | 68 | event["lambda"] = get_lambda_transform_details(team, dataset, pipeline, stage) # custom user code called 69 | except Exception as e: 70 | logger.error("Fatal error", exc_info=True) 71 | octagon_client.end_pipeline_execution_failed( 72 | component=component, 73 | issue_comment="{} {} Error: {}".format(stage, component, repr(e)), 74 | ) 75 | raise e 76 | return {"statusCode": 200, "body": event} 77 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-routing/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import StateMachineConfiguration 6 | from datalake_library.interfaces.states_interface import StatesInterface 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | logger.info("Received {} messages".format(len(event["Records"]))) 14 | for record in event["Records"]: 15 | logger.info("Starting State Machine Execution") 16 | event_body = json.loads(record["body"]) 17 | object_key = event_body["object"]["key"].split("/") 18 | team = object_key[0] 19 | dataset = object_key[1] 20 | pipeline = os.environ["PIPELINE"] 21 | pipeline_stage = os.environ["PIPELINE_STAGE"] 22 | org = os.environ["ORG"] 23 | domain = os.environ["DOMAIN"] 24 | env = os.environ["ENV"] 25 | 26 | event_with_pipeline_details = { 27 | **event_body["object"], 28 | "bucket": event_body["bucket"]["name"], 29 | "team": team, 30 | "dataset": dataset, 31 | "pipeline": pipeline, 32 | "pipeline_stage": pipeline_stage, 33 | "org": org, 34 | "domain": domain, 35 | 
"env": env, 36 | } 37 | 38 | state_config = StateMachineConfiguration(team, pipeline, pipeline_stage) 39 | StatesInterface().run_state_machine( 40 | state_config.get_stage_state_machine_arn, json.dumps(event_with_pipeline_details) 41 | ) 42 | except Exception as e: 43 | logger.error("Fatal error", exc_info=True) 44 | raise e 45 | -------------------------------------------------------------------------------- /sdlf-stageA/state-machine/stage-a.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Simple Lambda-based transform", 3 | "StartAt": "Try", 4 | "States": { 5 | "Try": { 6 | "Type": "Parallel", 7 | "Branches": [ 8 | { 9 | "StartAt": "Pre-update Catalog", 10 | "States": { 11 | "Pre-update Catalog": { 12 | "Type": "Task", 13 | "Resource": "arn:aws:states:::lambda:invoke", 14 | "OutputPath": "$.Payload", 15 | "Parameters": { 16 | "Payload.$": "$", 17 | "FunctionName": "${lStep1}:$LATEST" 18 | }, 19 | "Retry": [ 20 | { 21 | "ErrorEquals": [ 22 | "Lambda.ServiceException", 23 | "Lambda.AWSLambdaException", 24 | "Lambda.SdkClientException", 25 | "Lambda.TooManyRequestsException" 26 | ], 27 | "IntervalSeconds": 2, 28 | "MaxAttempts": 6, 29 | "BackoffRate": 2 30 | } 31 | ], 32 | "Next": "Execute Light Transformation" 33 | }, 34 | "Execute Light Transformation": { 35 | "Type": "Task", 36 | "Resource": "arn:aws:states:::lambda:invoke", 37 | "OutputPath": "$.Payload", 38 | "Parameters": { 39 | "Payload.$": "$", 40 | "FunctionName.$": "$.body.lambda.lambda_arn" 41 | }, 42 | "Retry": [ 43 | { 44 | "ErrorEquals": [ 45 | "Lambda.ServiceException", 46 | "Lambda.AWSLambdaException", 47 | "Lambda.SdkClientException", 48 | "Lambda.TooManyRequestsException" 49 | ], 50 | "IntervalSeconds": 2, 51 | "MaxAttempts": 6, 52 | "BackoffRate": 2 53 | } 54 | ], 55 | "Next": "Post-update Catalog" 56 | }, 57 | "Post-update Catalog": { 58 | "Type": "Task", 59 | "Resource": "arn:aws:states:::lambda:invoke", 60 | "ResultPath": null, 61 | "Parameters": { 62 | "Payload.$": "$", 63 | "FunctionName": "${lStep3}:$LATEST" 64 | }, 65 | "Retry": [ 66 | { 67 | "ErrorEquals": [ 68 | "Lambda.ServiceException", 69 | "Lambda.AWSLambdaException", 70 | "Lambda.SdkClientException", 71 | "Lambda.TooManyRequestsException" 72 | ], 73 | "IntervalSeconds": 2, 74 | "MaxAttempts": 6, 75 | "BackoffRate": 2 76 | } 77 | ], 78 | "End": true 79 | } 80 | } 81 | } 82 | ], 83 | "End": true, 84 | "Catch": [ 85 | { 86 | "ErrorEquals": [ 87 | "States.ALL" 88 | ], 89 | "ResultPath": null, 90 | "Next": "Error" 91 | } 92 | ] 93 | }, 94 | "Error": { 95 | "Type": "Task", 96 | "Resource": "arn:aws:states:::lambda:invoke", 97 | "OutputPath": "$.Payload", 98 | "Parameters": { 99 | "Payload.$": "$", 100 | "FunctionName": "${lError}:$LATEST" 101 | }, 102 | "Retry": [ 103 | { 104 | "ErrorEquals": [ 105 | "Lambda.ServiceException", 106 | "Lambda.AWSLambdaException", 107 | "Lambda.SdkClientException", 108 | "Lambda.TooManyRequestsException" 109 | ], 110 | "IntervalSeconds": 2, 111 | "MaxAttempts": 6, 112 | "BackoffRate": 2 113 | } 114 | ], 115 | "Next": "Fail" 116 | }, 117 | "Fail": { 118 | "Type": "Fail" 119 | } 120 | } 121 | } -------------------------------------------------------------------------------- /sdlf-stageB/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 
15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | if isinstance(event, str): 13 | event = json.loads(event) 14 | sqs_config = SQSConfiguration(event["body"]["team"], event["body"]["pipeline"], event["body"]["pipeline_stage"]) 15 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 16 | 17 | logger.info("Execution Failed. Sending original payload to DLQ") 18 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 19 | except Exception as e: 20 | logger.error("Fatal error", exc_info=True) 21 | raise e 22 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-fetch-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | from datalake_library import octagon 2 | from datalake_library.commons import init_logger 3 | from datalake_library.configuration.resource_configs import DynamoConfiguration 4 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 5 | 6 | logger = init_logger(__name__) 7 | 8 | 9 | def get_glue_transform_details(bucket, team, dataset, pipeline, stage): 10 | dynamo_config = DynamoConfiguration() 11 | dynamo_interface = DynamoInterface(dynamo_config) 12 | transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}") 13 | # we assume a Glue Job has already been created based on customer needs 14 | job_name = f"sdlf-{team}-{dataset}-glue-job" # Name of the Glue Job 15 | glue_capacity = {"WorkerType": "G.1X", "NumberOfWorkers": 10} 16 | wait_time = 60 17 | glue_arguments = { 18 | # Specify any arguments needed based on bucket and keys (e.g. 
input/output S3 locations) 19 | "--SOURCE_LOCATION": f"s3://{bucket}/pre-stage/{team}/{dataset}", 20 | "--OUTPUT_LOCATION": f"s3://{bucket}/post-stage/{team}/{dataset}", 21 | "--job-bookmark-option": "job-bookmark-enable", 22 | } 23 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 24 | if pipeline in transform_info.get("pipeline", {}): 25 | if stage in transform_info["pipeline"][pipeline]: 26 | logger.info(f"Details from DynamoDB: {transform_info['pipeline'][pipeline][stage]}") 27 | job_name = transform_info["pipeline"][pipeline][stage].get("job_name", job_name) 28 | glue_capacity = transform_info["pipeline"][pipeline][stage].get("glue_capacity", glue_capacity) 29 | wait_time = transform_info["pipeline"][pipeline][stage].get("wait_time", wait_time) 30 | glue_arguments |= transform_info["pipeline"][pipeline][stage].get("glue_extra_arguments", {}) 31 | 32 | return {"job_name": job_name, "wait_time": wait_time, "arguments": glue_arguments, **glue_capacity} 33 | 34 | 35 | def lambda_handler(event, context): 36 | """Calls custom transform developed by user 37 | 38 | Arguments: 39 | event {dict} -- Dictionary with details on previous processing step 40 | context {dict} -- Dictionary with details on Lambda context 41 | 42 | Returns: 43 | {dict} -- Dictionary with Processed Bucket and Key(s) 44 | """ 45 | try: 46 | logger.info("Fetching event data from previous step") 47 | bucket = event["body"]["bucket"] 48 | team = event["body"]["team"] 49 | pipeline = event["body"]["pipeline"] 50 | stage = event["body"]["pipeline_stage"] 51 | dataset = event["body"]["dataset"] 52 | 53 | logger.info("Initializing Octagon client") 54 | component = context.function_name.split("-")[-2].title() 55 | octagon_client = ( 56 | octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["body"]["env"]).build() 57 | ) 58 | peh_id = octagon_client.start_pipeline_execution( 59 | pipeline_name="{}-{}-{}".format(team, pipeline, stage), 60 | dataset_name="{}-{}".format(team, dataset), 61 | comment=event, 62 | ) 63 | 64 | # Call custom transform created by user and process the file 65 | logger.info("Calling user custom processing code") 66 | event["body"]["glue"] = get_glue_transform_details( 67 | bucket, team, dataset, pipeline, stage 68 | ) # custom user code called 69 | event["body"]["glue"]["crawler_name"] = "-".join(["sdlf", team, dataset, "post-stage-crawler"]) 70 | event["body"]["peh_id"] = peh_id 71 | octagon_client.update_pipeline_execution( 72 | status="{} {} Processing".format(stage, component), component=component 73 | ) 74 | except Exception as e: 75 | logger.error("Fatal error", exc_info=True) 76 | octagon_client.end_pipeline_execution_failed( 77 | component=component, 78 | issue_comment="{} {} Error: {}".format(stage, component, repr(e)), 79 | ) 80 | raise e 81 | return event 82 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | from datalake_library import octagon 2 | from datalake_library.commons import init_logger 3 | from datalake_library.configuration.resource_configs import DynamoConfiguration 4 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 5 | from datalake_library.interfaces.s3_interface import S3Interface 6 | from datalake_library.octagon import peh 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def lambda_handler(event, context): 12 | """Updates the S3 
objects metadata catalog 13 | 14 | Arguments: 15 | event {dict} -- Dictionary with details on Bucket and Keys 16 | context {dict} -- Dictionary with details on Lambda context 17 | 18 | Returns: 19 | {dict} -- Dictionary with response 20 | """ 21 | try: 22 | logger.info("Fetching event data from previous step") 23 | bucket = event["body"]["bucket"] 24 | team = event["body"]["team"] 25 | pipeline = event["body"]["pipeline"] 26 | stage = event["body"]["pipeline_stage"] 27 | dataset = event["body"]["dataset"] 28 | peh_id = event["body"]["peh_id"] 29 | processed_keys_path = f"post-stage/{team}/{dataset}" 30 | processed_keys = S3Interface().list_objects(bucket, processed_keys_path) 31 | 32 | logger.info("Initializing Octagon client") 33 | component = context.function_name.split("-")[-2].title() 34 | octagon_client = ( 35 | octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(event["body"]["env"]).build() 36 | ) 37 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 38 | 39 | logger.info("Initializing DynamoDB config and Interface") 40 | dynamo_config = DynamoConfiguration() 41 | dynamo_interface = DynamoInterface(dynamo_config) 42 | 43 | logger.info("Storing metadata to DynamoDB") 44 | all_objects_metadata = [] 45 | for key in processed_keys: 46 | size, last_modified_date = S3Interface().get_size_and_last_modified(bucket, key) 47 | object_metadata = { 48 | "bucket": bucket, 49 | "key": key, 50 | "size": size, 51 | "last_modified_date": last_modified_date, 52 | "org": event["body"]["org"], 53 | "app": event["body"]["domain"], 54 | "env": event["body"]["env"], 55 | "team": team, 56 | "pipeline": pipeline, 57 | "dataset": dataset, 58 | "stage": "stage", 59 | "pipeline_stage": stage, 60 | "peh_id": peh_id, 61 | } 62 | all_objects_metadata.append(object_metadata) 63 | dynamo_interface.batch_update_object_metadata_catalog(all_objects_metadata) 64 | 65 | octagon_client.update_pipeline_execution( 66 | status="{} {} Processing".format(stage, component), component=component 67 | ) 68 | octagon_client.end_pipeline_execution_success() 69 | except Exception as e: 70 | logger.error("Fatal error", exc_info=True) 71 | octagon_client.end_pipeline_execution_failed( 72 | component=component, issue_comment="{} {} Error: {}".format(stage, component, repr(e)) 73 | ) 74 | raise e 75 | return 200 76 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration, StateMachineConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | from datalake_library.interfaces.states_interface import StatesInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | team = os.environ["TEAM"] 15 | pipeline = os.environ["PIPELINE"] 16 | stage = os.environ["STAGE"] 17 | state_config = StateMachineConfiguration(team, pipeline, stage) 18 | sqs_config = SQSConfiguration(team, pipeline, stage) 19 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 20 | 21 | messages = dlq_interface.receive_messages(1) 22 | if not messages: 23 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 24 | return 25 | 26 | logger.info("Received {} 
messages".format(len(messages))) 27 | for message in messages: 28 | logger.info("Starting State Machine Execution") 29 | if isinstance(message["Body"], str): 30 | response = json.loads(message["Body"]) 31 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 32 | logger.info("Redrive message succeeded") 33 | except Exception as e: 34 | logger.error("Fatal error", exc_info=True) 35 | raise e 36 | return 37 | -------------------------------------------------------------------------------- /sdlf-team/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-team/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-team 2 | 3 | !!! note 4 | `sdlf-team` is defined in the [sdlf-team](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-team) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Team](../_static/sdlf-team.png){: style="width:80%"} 9 | 10 | A team is a group of individuals that wish to onboard into the data lake. It can be a pizza team of developers or an entire Business Unit such as the marketing or finance department. A team is responsible for their data pipelines, datasets and repositories which are unique to the team and completely segregated from others. Teams are also isolated from both an operational and security standpoint through least-privilege IAM policies. 11 | 12 | As such `sdlf-team` is mostly about permissions. 13 | 14 | The two `Pipelines` and `Datasets` Lambda functions (and related resources) are used to populate the DynamoDB tables `octagon-Pipelines-{environment}` and `octagon-Datasets-{environment}` from `sdlf-foundations`. 15 | 16 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 17 | 18 | !!! warning 19 | The data lake admin team should be the only one with write access to the `sdlf-team` code base, as it is used to restrict permissions given to team members. 20 | 21 | ## Usage 22 | 23 | ### CloudFormation with [sdlf-cicd](cicd.md) 24 | 25 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 26 | 27 | ``` 28 | rExample: 29 | Type: awslabs::sdlf::team::MODULE 30 | Properties: 31 | pPipelineReference: !Ref pPipelineReference 32 | pTeamName: industry 33 | pEnvironment: dev 34 | pSNSNotificationsEmail: nobody@amazon.com 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-team` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Athena/{team}/WorkgroupName` | Team Athena workgroup name | | 44 | | `/SDLF/EventBridge/{team}/EventBusName` | Name of the team dedicated event bus | | 45 | | `/SDLF/EventBridge/{team}/ScheduleGroupName` | Name of the team dedicated schedule group | | 46 | | `/SDLF/Glue/${pTeamName}/SecurityConfigurationId` | Glue security configuration name | | 47 | | `/SDLF/IAM/${pTeamName}/CrawlerRoleArn` | IAM Role ARN for Glue crawlers | | 48 | | `/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary` | ARN of the permissions boundary IAM Managed policy for the team | | 49 | | `/SDLF/KMS/${pTeamName}/DataKeyId` | ARN of the team KMS data key | | 50 | | `/SDLF/KMS/${pTeamName}/InfraKeyId` | ARN of the team KMS infrastructure key | | 51 | | `/SDLF/SNS/${pTeamName}/Notifications` | ARN of the team-specific SNS Topic | | 52 | -------------------------------------------------------------------------------- /sdlf-team/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.team" 3 | version = "2.10.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /sdlf-team/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/1cc98f5051b4145e76e9d3c1346b3126abe24989/sdlf-team/src/__init__.py -------------------------------------------------------------------------------- /sdlf-team/src/lambda/datasets-dynamodb/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | import boto3 6 | from boto3.dynamodb.types import TypeSerializer 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | dynamodb = boto3.client("dynamodb") 12 | ssm_endpoint_url = "https://ssm." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 13 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 14 | 15 | 16 | def delete_dynamodb_dataset_entry(table_name, team_name, dataset_name): 17 | response = dynamodb.delete_item( 18 | TableName=table_name, 19 | Key={"name": {"S": f"{team_name}-{dataset_name}"}}, 20 | ) 21 | return response 22 | 23 | 24 | def create_dynamodb_dataset_entry(table_name, team_name, dataset_name, pipeline_details): 25 | pipeline_details_dynamodb_json = TypeSerializer().serialize(pipeline_details) 26 | logger.info("PIPELINE DETAILS DYNAMODB JSON: %s", pipeline_details_dynamodb_json) 27 | response = dynamodb.update_item( 28 | TableName=table_name, 29 | Key={"name": {"S": f"{team_name}-{dataset_name}"}}, 30 | ExpressionAttributeNames={ 31 | "#P": "pipeline", 32 | "#V": "version", 33 | }, 34 | ExpressionAttributeValues={ 35 | ":p": pipeline_details_dynamodb_json, 36 | ":v": {"N": "1"}, 37 | }, 38 | UpdateExpression="SET #P = :p, #V = :v", 39 | ReturnValues="UPDATED_NEW", 40 | ) 41 | return response 42 | 43 | 44 | def lambda_handler(event, context): 45 | try: 46 | environment = os.getenv("ENVIRONMENT") 47 | team_name = os.getenv("TEAM_NAME") 48 | table = f"octagon-Datasets-{environment}" 49 | 50 | paginator = ssm.get_paginator("get_parameters_by_path") 51 | datasets_pages = paginator.paginate(Path=f"/SDLF/Datasets/{team_name}") 52 | 53 | for datasets_page in datasets_pages: 54 | for dataset in datasets_page["Parameters"]: 55 | dataset_name = dataset["Name"].split("/")[-1] 56 | logger.info("DATASET SSM CONTENT: %s", dataset["Value"]) 57 | dataset_pipeline_details = json.loads(dataset["Value"]) 58 | create_dynamodb_dataset_entry(table, team_name, dataset_name, dataset_pipeline_details) 59 | logger.info(f"{team_name}-{dataset_name} DynamoDB Dataset entry created") 60 | 61 | logger.info("INFO: Entries for datasets that no longer exist are not removed from DynamoDB") 62 | except Exception as e: 63 | message = "Function exception: " + str(e) 64 | logger.error(message, exc_info=True) 65 | raise 66 | 67 | return "Success" 68 | -------------------------------------------------------------------------------- /sdlf-team/src/lambda/pipelines-dynamodb/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | 9 | dynamodb = boto3.client("dynamodb") 10 | ssm_endpoint_url = "https://ssm." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 11 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 12 | 13 | 14 | def delete_dynamodb_pipeline_entry(table_name, team_name, pipeline_name, stage_name): 15 | response = dynamodb.delete_item( 16 | TableName=table_name, 17 | Key={"name": {"S": f"{team_name}-{pipeline_name}-{stage_name}"}}, 18 | ) 19 | return response 20 | 21 | 22 | def create_dynamodb_pipeline_entry(table_name, team_name, pipeline_name, stage_name): 23 | response = dynamodb.update_item( 24 | TableName=table_name, 25 | Key={"name": {"S": f"{team_name}-{pipeline_name}-{stage_name}"}}, 26 | ExpressionAttributeNames={ 27 | "#T": "type", 28 | "#S": "status", 29 | "#P": "pipeline", 30 | "#V": "version", 31 | }, 32 | ExpressionAttributeValues={ 33 | ":t": { 34 | "S": "TRANSFORMATION", 35 | }, 36 | ":s": {"S": "ACTIVE"}, 37 | ":p": {"M": {"max_items_process": {"N": "100"}, "min_items_process": {"N": "1"}}}, 38 | ":v": {"N": "1"}, 39 | }, 40 | UpdateExpression="SET #T = :t, #S = :s, #P = :p, #V = :v", 41 | ReturnValues="UPDATED_NEW", 42 | ) 43 | return response 44 | 45 | 46 | def lambda_handler(event, context): 47 | try: 48 | environment = os.getenv("ENVIRONMENT") 49 | team_name = os.getenv("TEAM_NAME") 50 | table = f"octagon-Pipelines-{environment}" 51 | 52 | paginator = ssm.get_paginator("get_parameters_by_path") 53 | stages_pages = paginator.paginate( 54 | Path=f"/SDLF/Pipelines/{team_name}", 55 | Recursive=True, 56 | ) 57 | for stages_page in stages_pages: 58 | for stage in stages_page["Parameters"]: 59 | pipeline_name = stage["Name"].split("/")[-2] 60 | stage_name = stage["Name"].split("/")[-1] 61 | create_dynamodb_pipeline_entry(table, team_name, pipeline_name, stage_name) 62 | logger.info(f"{team_name}-{pipeline_name}-{stage_name} DynamoDB Pipeline entry created") 63 | 64 | logger.info("INFO: Entries for stages that no longer exist are *not* removed from DynamoDB") 65 | except Exception as e: 66 | message = "Function exception: " + str(e) 67 | logger.error(message, exc_info=True) 68 | raise 69 | 70 | return "Success" 71 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/dataset-legislators.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Example datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/dataset/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pS3Prefix: legislators 17 | pDeploymentInstance: dev 18 | pStorageDeploymentInstance: dev 19 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/foundations-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: SDLF Foundations in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rAnycompany: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/foundations/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pChildAccountId: !Ref AWS::AccountId 17 | pOrg: anycompany 18 | pDomain: datalake 19 | pDeploymentInstance: 
dev 20 | pCicdRole: Admin 21 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/stagelambda/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pDeploymentInstance: mainA 17 | pStorageDeploymentInstance: dev 18 | pDatasetDeploymentInstance: dev 19 | pTriggerType: event 20 | pEventPattern: >- 21 | { 22 | "source": ["aws.s3"], 23 | "detail-type": ["Object Created"], 24 | "detail": { 25 | "bucket": { 26 | "name": ["{{resolve:ssm:/sdlf/storage/rRawBucket/dev}}"] 27 | }, 28 | "object": { 29 | "key": [{ "prefix": "legislators/" }] 30 | } 31 | } 32 | } 33 | pEnableTracing: false 34 | 35 | rMainB: 36 | Type: AWS::CloudFormation::Stack 37 | Properties: 38 | TemplateURL: "{{resolve:ssm:/sdlf/stageglue/main}}" 39 | Parameters: 40 | pPipelineReference: !Ref pPipelineReference 41 | pDeploymentInstance: mainB 42 | pStorageDeploymentInstance: dev 43 | pDatasetDeploymentInstance: dev 44 | pGlueJobName: sdlf-mainB-glue-job 45 | pGlueNumberOfWorkers: 10 46 | pGlueWorkerType: G.1X 47 | pTriggerType: schedule 48 | pEventPattern: >- 49 | { 50 | "source": ["aws.s3"], 51 | "detail-type": ["Object Created"], 52 | "detail": { 53 | "bucket": { 54 | "name": ["{{resolve:ssm:/sdlf/storage/rStageBucket/dev}}"] 55 | }, 56 | "object": { 57 | "key": [{ "prefix": "legislators/mainA/" }] 58 | } 59 | } 60 | } 61 | pSchedule: "cron(*/5 * * * ? 
*)" 62 | pEnableTracing: false 63 | pGlueArguments: >- 64 | { 65 | "--job-bookmark-option": "job-bookmark-enable", 66 | "--enable-metrics": "", 67 | "--enable-auto-scaling": "true", 68 | "--SOURCE_LOCATION": !Sub "s3://{{resolve:ssm:/sdlf/storage/rStageBucket/dev}}/legislators/mainA", 69 | "--OUTPUT_LOCATION": !Sub "s3://{{resolve:ssm:/sdlf/storage/rAnalyticsBucket/dev}}/legislators/mainB" 70 | } 71 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/team-datalake-engineering-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering SDLF Team in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rEngineering: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/team/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pTeamName: engineering 17 | pStorageDeploymentInstance: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/datasets.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Engineering team datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: awslabs::sdlf::dataset::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: engineering 15 | pDatasetName: legislators 16 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering team Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: main 16 | pTeamName: engineering 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "engineering/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | 33 | rMainB: 34 | Type: awslabs::sdlf::stageB::MODULE 35 | Properties: 36 | pPipelineReference: !Ref pPipelineReference 37 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 38 | pStageName: B 39 | pPipeline: main 40 | pTeamName: engineering 41 | pTriggerType: schedule 42 | pEventPattern: !Sub >- 43 | { 44 | "source": ["aws.states"], 45 | "detail-type": ["Step Functions Execution Status Change"], 46 | "detail": { 47 | "status": ["SUCCEEDED"], 48 | "stateMachineArn": 
["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-engineering-main-sm-A"] 49 | } 50 | } 51 | pSchedule: "cron(*/5 * * * ? *)" 52 | pEnableTracing: false 53 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/pipelines.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering team pipelines 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMain: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./pipeline-main.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | Tags: 19 | - Key: sdlf:pipeline 20 | Value: main 21 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/datadomain-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: datalake data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rForecourt: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-datalake-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rEngineering: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rForecourt 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-datalake-engineering-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: engineering 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/foundations-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rForecourt: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: !Ref "AWS::AccountId" 15 | pOrg: forecourt 16 | pDomain: datalake 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/team-datalake-engineering-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 
| Description: Engineering SDLF Team in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rEngineering: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: engineering 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/datasets.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: iot team datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: awslabs::sdlf::dataset::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: iot 15 | pDatasetName: legislators 16 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: main 16 | pTeamName: iot 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "iot/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | 33 | rMainB: 34 | Type: awslabs::sdlf::stageB::MODULE 35 | Properties: 36 | pPipelineReference: !Ref pPipelineReference 37 | pDatasetBucket: "{{resolve:ssm:/SDLF/S3/StageBucket}}" 38 | pStageName: B 39 | pPipeline: main 40 | pTeamName: iot 41 | pTriggerType: schedule 42 | pEventPattern: !Sub >- 43 | { 44 | "source": ["aws.states"], 45 | "detail-type": ["Step Functions Execution Status Change"], 46 | "detail": { 47 | "status": ["SUCCEEDED"], 48 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 49 | } 50 | } 51 | pSchedule: "cron(*/5 * * * ? 
*)" 52 | pEnableTracing: false 53 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipeline-singlestage.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Single stage pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rSingleA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: singlestage 16 | pTeamName: iot 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "iot/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipelines.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: iot team pipelines 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMain: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./pipeline-main.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | Tags: 19 | - Key: sdlf:pipeline 20 | Value: main 21 | rSingleStage: 22 | Type: AWS::CloudFormation::Stack 23 | DeletionPolicy: Delete 24 | UpdateReplacePolicy: Delete 25 | Properties: 26 | TemplateURL: ./pipeline-singlestage.yaml 27 | Parameters: 28 | pPipelineReference: !Ref pPipelineReference 29 | Tags: 30 | - Key: sdlf:pipeline 31 | Value: singlestage 32 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf", 4 | "sdlf:team" : "iot" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/datadomain-marketing-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: marketing data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMarketing: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-marketing-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rIndustry: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rMarketing 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-marketing-industry-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: industry 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/datadomain-proserve-dev.yaml: 
-------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: proserve data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rProserve: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-proserve-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rIot: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rProserve 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-proserve-iot-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: iot 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/foundations-marketing-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in marketing domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMarketing: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: 222222222222 15 | pOrg: forecourt 16 | pDomain: marketing 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/foundations-proserve-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in proserve domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rProserve: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: 111111111111 15 | pOrg: forecourt 16 | pDomain: proserve 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/team-marketing-industry-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Industry SDLF Team in marketing domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rIndustry: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: industry 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/team-proserve-iot-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: IOT SDLF Team in proserve domain, dev 
environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rIot: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: iot 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/clean-up.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | STORAGE_DEPLOYMENT_INSTANCE=dev 4 | DATASET_DEPLOYMENT_INSTANCE=dev 5 | TEAM_NAME=engineering 6 | #PRINCIPAL= 7 | 8 | # echo "Granting Drop on Glue DBs" 9 | # SDLF_ORG=$(aws ssm get-parameter --name "/sdlf/storage/rOrganization/$STORAGE_DEPLOYMENT_INSTANCE" --query "Parameter.Value" --output text) 10 | # for DB in $(aws glue get-databases | jq -r '.[][].Name') 11 | # do 12 | # case "$DB" in 13 | # $SDLF_ORG*) aws lakeformation grant-permissions --principal DataLakePrincipalIdentifier="$PRINCIPAL" --permissions DROP --resource $(echo \'{\"Database\":{\"Name\":\"$DB\"}}\' | tr -d \');; 14 | # *) echo "Skipping non-SDLF database" ;; 15 | # esac 16 | # done 17 | 18 | echo "Fetch KMS keys ARN - SSM parameters won't be available once stacks have been deleted" 19 | declare -a KEYS=("/sdlf/storage/rKMSKey/$STORAGE_DEPLOYMENT_INSTANCE" 20 | "/sdlf/dataset/rKMSInfraKey/$DATASET_DEPLOYMENT_INSTANCE" 21 | "/sdlf/dataset/rKMSDataKey/$DATASET_DEPLOYMENT_INSTANCE" 22 | "/SDLF/KMS/$TEAM_NAME/InfraKeyId" 23 | ) 24 | KEYS_ARN=() 25 | for KEY in "${KEYS[@]}" 26 | do 27 | echo "Finding $KEY ARN" 28 | if KEY_ARN=$(aws ssm get-parameter --name "$KEY" --query "Parameter.Value" --output text); then 29 | KEYS_ARN+=("$KEY_ARN") 30 | else 31 | echo "Key does not exist, skipping" 32 | fi 33 | done 34 | 35 | echo "Emptying SDLF buckets..." 36 | declare -a BUCKETS=("/sdlf/storage/rArtifactsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 37 | "/sdlf/storage/rRawBucket/$STORAGE_DEPLOYMENT_INSTANCE" 38 | "/sdlf/storage/rStageBucket/$STORAGE_DEPLOYMENT_INSTANCE" 39 | "/sdlf/storage/rAnalyticsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 40 | "/sdlf/storage/rAthenaBucket/$STORAGE_DEPLOYMENT_INSTANCE" 41 | "/sdlf/storage/rS3AccessLogsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 42 | ) 43 | for BUCKET in "${BUCKETS[@]}" 44 | do 45 | echo "Finding $BUCKET bucket name" 46 | if S3_BUCKET=$(aws ssm get-parameter --name "$BUCKET" --query "Parameter.Value" --output text); then 47 | echo "Emptying $S3_BUCKET" 48 | aws s3 rm "s3://$S3_BUCKET" --recursive 49 | if [ "$(aws s3api get-bucket-versioning --bucket "$S3_BUCKET" --output text)" == "Enabled" ]; then 50 | objects_versions=$(aws s3api list-object-versions --bucket "$S3_BUCKET" --output=json --query='{Objects: Versions[].{Key:Key,VersionId:VersionId}}') 51 | if [ "$(jq -r ".Objects" <<< "$objects_versions")" != "null" ]; then 52 | aws s3api delete-objects --bucket "$S3_BUCKET" --delete "$objects_versions" 53 | fi 54 | fi 55 | else 56 | echo "Bucket does not exist, skipping" 57 | fi 58 | done 59 | 60 | echo "Deleting SDLF stacks..." 
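# Only root-level stacks are targeted (nested stacks are deleted along with their parents),
# sorted newest-first so dependent stacks are removed before the foundations they rely on.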
61 | STACKS=$(aws cloudformation list-stacks --query "StackSummaries[?starts_with(StackName,'sdlf-') && StackStatus!='DELETE_COMPLETE']" | jq -r "sort_by(.CreationTime) | reverse[] | select(.ParentId == null) | .StackName") 62 | for STACK in $STACKS 63 | do 64 | echo "Deleting stack $STACK" 65 | aws cloudformation delete-stack --stack-name "$STACK" 66 | done 67 | for STACK in $STACKS 68 | do 69 | echo "Waiting for $STACK stack delete to complete ..." && aws cloudformation wait stack-delete-complete --stack-name "$STACK" && echo "Finished delete successfully!" 70 | done 71 | 72 | echo "Deleting KMS keys" 73 | for KEY_ARN in "${KEYS_ARN[@]}" 74 | do 75 | echo "Deleting $KEY_ARN" 76 | aws kms schedule-key-deletion --key-id "$KEY_ARN" --pending-window-in-days 7 2>&1 77 | done 78 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pflag=false 3 | 4 | DIRNAME=$(dirname "$0") 5 | 6 | usage () { echo " 7 | -h -- Opens up this help message 8 | -p -- Name of the AWS profile to use 9 | "; } 10 | options=':p:h' 11 | while getopts "$options" option 12 | do 13 | case "$option" in 14 | p ) pflag=true; PROFILE=$OPTARG;; 15 | h ) usage; exit;; 16 | \? ) echo "Unknown option: -$OPTARG" >&2; exit 1;; 17 | : ) echo "Missing option argument for -$OPTARG" >&2; exit 1;; 18 | * ) echo "Unimplemented option: -$OPTARG" >&2; exit 1;; 19 | esac 20 | done 21 | 22 | if "$pflag" 23 | then 24 | echo "using AWS profile $PROFILE..." >&2 25 | fi 26 | REGION=$(aws configure get region ${PROFILE:+--profile "$PROFILE"}) 27 | 28 | ARTIFACTS_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/storage/rArtifactsBucket/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 29 | aws s3 cp "$DIRNAME/scripts/legislators-glue-job.py" "s3://$ARTIFACTS_BUCKET/artifacts/" ${PROFILE:+--profile "$PROFILE"} 30 | 31 | mkdir "$DIRNAME"/output 32 | 33 | function send_legislators() 34 | { 35 | ORIGIN="$DIRNAME/data/" 36 | 37 | RAW_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/storage/rRawBucket/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 38 | KMS_KEY=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/dataset/rKMSDataKey/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 39 | 40 | S3_DESTINATION=s3://$RAW_BUCKET/ 41 | COUNT=0 42 | for FILE in "$ORIGIN"/*.json; 43 | do 44 | (( COUNT++ )) || true 45 | aws s3 cp "$FILE" "${S3_DESTINATION}legislators/" --sse aws:kms --sse-kms-key-id "$KMS_KEY" ${PROFILE:+--profile "$PROFILE"} 46 | echo "transferred $COUNT files" 47 | done 48 | } 49 | 50 | VPC_SUPPORT=$(aws --region "$REGION" ssm get-parameter --name "/SDLF/VPC/Enabled" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"} 2>/dev/null) 51 | if [ -z "$VPC_SUPPORT" ] 52 | then 53 | aws --region "$REGION" ssm put-parameter --name "/SDLF/VPC/Enabled" --value "false" --type String ${PROFILE:+--profile "$PROFILE"} 54 | fi 55 | 56 | aws cloudformation package --template-file "$DIRNAME"/scripts/legislators-glue-job.yaml \ 57 | --s3-bucket "$ARTIFACTS_BUCKET" \ 58 | ${PROFILE:+--profile "$PROFILE"} \ 59 | --output-template-file "$DIRNAME"/output/packaged-template.yaml 60 | 61 | STACK_NAME="sdlf-legislators-glue-job" 62 | aws cloudformation deploy \ 63 | --s3-bucket "$ARTIFACTS_BUCKET" --s3-prefix sdlf-utils \ 64 | --stack-name 
"$STACK_NAME" \ 65 | --template-file "$DIRNAME"/output/packaged-template.yaml \ 66 | --tags Framework=sdlf \ 67 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" \ 68 | --region "$REGION" \ 69 | ${PROFILE:+--profile "$PROFILE"} || exit 1 70 | 71 | send_legislators 72 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/scripts/legislators-glue-job.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from awsglue.context import GlueContext 4 | from awsglue.job import Job 5 | from awsglue.transforms import Join 6 | from awsglue.utils import getResolvedOptions 7 | from pyspark.context import SparkContext 8 | 9 | args = getResolvedOptions(sys.argv, ["JOB_NAME", "SOURCE_LOCATION", "OUTPUT_LOCATION"]) 10 | source = args["SOURCE_LOCATION"] 11 | destination = args["OUTPUT_LOCATION"] 12 | 13 | glueContext = GlueContext(SparkContext.getOrCreate()) 14 | job = Job(glueContext) 15 | job.init(args["JOB_NAME"], args) 16 | 17 | persons = glueContext.create_dynamic_frame.from_options( 18 | connection_type="s3", 19 | format="json", 20 | connection_options={"paths": ["{}/{}".format(source, "persons_parsed.json")]}, 21 | format_options={"withHeader": False}, 22 | transformation_ctx="path={}".format("persons_df"), 23 | ) 24 | 25 | memberships = glueContext.create_dynamic_frame.from_options( 26 | connection_type="s3", 27 | format="json", 28 | connection_options={"paths": ["{}/{}".format(source, "memberships_parsed.json")]}, 29 | format_options={"withHeader": False}, 30 | transformation_ctx="path={}".format("memberships_df"), 31 | ) 32 | 33 | organizations = ( 34 | glueContext.create_dynamic_frame.from_options( 35 | connection_type="s3", 36 | format="json", 37 | connection_options={"paths": ["{}/{}".format(source, "organizations_parsed.json")]}, 38 | format_options={"withHeader": False}, 39 | transformation_ctx="path={}".format("organizations_df"), 40 | ) 41 | .rename_field("id", "org_id") 42 | .rename_field("name", "org_name") 43 | ) 44 | 45 | history = Join.apply( 46 | organizations, Join.apply(persons, memberships, "id", "person_id"), "org_id", "organization_id" 47 | ).drop_fields(["person_id", "org_id"]) 48 | 49 | persons.toDF().write.mode("overwrite").parquet("{}/persons/".format(destination)) 50 | organizations.toDF().write.mode("overwrite").parquet("{}/organizations/".format(destination)) 51 | memberships.toDF().write.mode("overwrite").parquet("{}//memberships/".format(destination)) 52 | history.toDF().write.mode("overwrite").parquet("{}/history/".format(destination), partitionBy=["org_name"]) 53 | 54 | job.commit() 55 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/scripts/legislators-glue-job.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Glue Job Sample 3 | 4 | Parameters: 5 | pPipelineDeploymentInstance: 6 | Type: String 7 | Description: specific pipeline stage deployment instance this job is for 8 | Default: mainB 9 | pArtifactsBucket: 10 | Description: S3 bucket used to store artifacts (from CICD or generated by data pipelines) 11 | Type: AWS::SSM::Parameter::Value 12 | Default: /sdlf/storage/rArtifactsBucket/dev 13 | pEnableVpc: 14 | Description: Deploy SDLF resources in a VPC 15 | Type: AWS::SSM::Parameter::Value 16 | Default: /SDLF/VPC/Enabled 17 | 18 | Conditions: 19 | RunInVpc: 
!Equals [!Ref pEnableVpc, true] 20 | 21 | Resources: 22 | rGlueRole: 23 | Type: AWS::IAM::Role 24 | Properties: 25 | Path: /service-role/ 26 | AssumeRolePolicyDocument: 27 | Version: 2012-10-17 28 | Statement: 29 | - Effect: Allow 30 | Principal: 31 | Service: 32 | - glue.amazonaws.com 33 | Action: 34 | - sts:AssumeRole 35 | ManagedPolicyArns: 36 | - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole 37 | - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess 38 | - !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchLogsFullAccess 39 | Policies: 40 | - PolicyName: !Sub sdlf-${pPipelineDeploymentInstance}-glue-job 41 | PolicyDocument: 42 | Version: 2012-10-17 43 | Statement: 44 | - Effect: Allow 45 | Action: 46 | - kms:CreateGrant 47 | - kms:Decrypt 48 | - kms:DescribeKey 49 | - kms:Encrypt 50 | - kms:GenerateDataKey* 51 | - kms:ReEncrypt* 52 | Resource: 53 | - "{{resolve:ssm:/sdlf/dataset/rKMSInfraKey/dev:1}}" 54 | - "{{resolve:ssm:/sdlf/dataset/rKMSDataKey/dev:1}}" 55 | - "{{resolve:ssm:/sdlf/storage/rKMSKey/dev:1}}" 56 | 57 | rGlueJob: 58 | Type: AWS::Glue::Job 59 | Properties: 60 | Command: 61 | Name: glueetl 62 | PythonVersion: "3" 63 | ScriptLocation: !Sub s3://${pArtifactsBucket}/artifacts/${pPipelineDeploymentInstance}-glue-job.py 64 | DefaultArguments: !If 65 | - RunInVpc 66 | - 67 | "--job-bookmark-option": job-bookmark-enable 68 | "--enable-metrics": "" 69 | "--disable-proxy-v2": "true" 70 | - 71 | "--job-bookmark-option": job-bookmark-enable 72 | "--enable-metrics": "" 73 | ExecutionProperty: 74 | MaxConcurrentRuns: 3 75 | MaxRetries: 0 76 | MaxCapacity: 2.0 77 | GlueVersion: "4.0" 78 | Name: !Sub sdlf-${pPipelineDeploymentInstance}-glue-job 79 | SecurityConfiguration: "{{resolve:ssm:/sdlf/dataset/rGlueSecurityConfiguration/dev:1}}" 80 | Role: !Ref rGlueRole 81 | -------------------------------------------------------------------------------- /validate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | shopt -s globstar 4 | 5 | # python 6 | ruff format --check . 7 | ruff check . 8 | # pylint $(git ls-files --exclude-standard '*.py') # pylint is disabled for now 9 | trivy fs --scanners vuln . 10 | 11 | # shell 12 | find . -type f \( -name '*.sh' -o -name '*.bash' -o -name '*.ksh' \) -print0 \ 13 | | xargs -0 shellcheck -x --format gcc 14 | 15 | # cloudformation 16 | cfn-lint ./**/*.yaml 17 | 18 | ## unfortunately cfn_nag doesn't support fn::foreach so we exclude files using it: https://github.com/stelligent/cfn_nag/issues/621 19 | find . -not \( -type f -name 'template-glue-job.yaml' -o -type f -name 'template-lambda-layer.yaml' \) -type f -name '*.yaml' -print0 \ 20 | | xargs -0 -L 1 cfn_nag_scan --fail-on-warnings --ignore-fatal --deny-list-path .cfn-nag-deny-list.yml --input-path 21 | --------------------------------------------------------------------------------