├── sdlf-cicd ├── README.md ├── tags.json ├── .gitignore ├── template-generic-cfn-template.yaml ├── template-glue-job.part ├── template-generic-cfn-module.yaml ├── template-cfn-module.yaml ├── lambda │ └── stagesrepositories-cicd │ │ └── src │ │ └── lambda_function.py ├── template-lambda-layer.yaml ├── template-cicd-team-repository.yaml └── template-glue-job.yaml ├── sdlf-dataset ├── src │ └── __init__.py ├── .gitignore ├── pyproject.toml └── README.md ├── sdlf-team ├── src │ ├── __init__.py │ └── lambda │ │ ├── datasets-dynamodb │ │ └── src │ │ │ └── lambda_function.py │ │ └── pipelines-dynamodb │ │ └── src │ │ └── lambda_function.py ├── .gitignore ├── pyproject.toml └── README.md ├── sdlf-foundations ├── src │ ├── __init__.py │ └── lambda │ │ ├── replicate │ │ └── src │ │ │ ├── event-update-table.json │ │ │ └── event-create-delete-table.json │ │ ├── catalog-redrive │ │ └── src │ │ │ └── lambda_function.py │ │ └── catalog │ │ └── src │ │ └── lambda_function.py ├── .gitignore └── pyproject.toml ├── sdlf-pipeline ├── src │ └── __init__.py ├── .gitignore └── pyproject.toml ├── sdlf-stage-ecsfargate ├── README.md ├── src │ ├── __init__.py │ ├── lambda │ │ ├── error │ │ │ └── src │ │ │ │ └── lambda_function.py │ │ ├── redrive │ │ │ └── src │ │ │ │ └── lambda_function.py │ │ └── postupdate-metadata │ │ │ └── src │ │ │ └── lambda_function.py │ └── state-machine │ │ └── stage-ecsfargate.asl.json ├── .gitignore └── pyproject.toml ├── sdlf-stage-glue ├── src │ ├── __init__.py │ └── lambda │ │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ │ └── postupdate-metadata │ │ └── src │ │ └── lambda_function.py ├── .gitignore ├── pyproject.toml └── README.md ├── sdlf-stage-lambda ├── src │ ├── __init__.py │ └── lambda │ │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ │ ├── postupdate-metadata │ │ └── src │ │ │ └── lambda_function.py │ │ └── process-object │ │ └── src │ │ └── lambda_function.py ├── .gitignore ├── pyproject.toml └── README.md ├── sdlf-stage-emrserverless ├── README.md ├── src │ ├── __init__.py │ └── lambda │ │ ├── error │ │ └── src │ │ │ └── lambda_function.py │ │ ├── redrive │ │ └── src │ │ │ └── lambda_function.py │ │ └── postupdate-metadata │ │ └── src │ │ └── lambda_function.py ├── .gitignore └── pyproject.toml ├── sdlf-datalakeLibrary ├── python │ ├── __init__.py │ └── datalake_library │ │ ├── __init__.py │ │ ├── interfaces │ │ ├── __init__.py │ │ ├── kms_interface.py │ │ ├── states_interface.py │ │ ├── dynamo_interface.py │ │ ├── base_interface.py │ │ ├── s3_interface.py │ │ └── sqs_interface.py │ │ ├── commons.py │ │ └── client.py ├── .gitignore ├── template-lambda-layer.yaml └── buildspec.sh ├── docs ├── requirements.txt ├── _static │ ├── sail-icon.ico │ ├── sail-icon.png │ ├── sdlf-cicd.png │ ├── sdlf-team.png │ ├── sdlf-dataset.png │ ├── sdlf-pipeline.png │ ├── sdlf-monitoring.png │ ├── sdlf-stage-glue.png │ ├── public-references.png │ ├── sdlf-foundations.png │ ├── sdlf-in-a-nutshell.png │ ├── sdlf-pipeline-full.png │ ├── sdlf-stage-lambda.png │ ├── sdlf-stage-dataquality.png │ ├── sdlf-layers-architecture.png │ ├── sdlf-architecture-datalake.png │ ├── sdlf-architecture-datamesh.png │ ├── sdlf-cicd-gluejobsdeployer.png │ ├── drawio │ │ ├── sdlf-pipeline.drawio │ │ ├── sdlf-dataset.drawio │ │ └── sdlf-monitoring.drawio │ └── sdlf-logo.svg ├── constructs │ ├── monitoring.md │ ├── stage-glue.md │ ├── stage-dataquality.md │ ├── stage-lambda.md │ ├── dataset.md 
│ └── team.md ├── architecture.md └── index.md ├── sdlf-utils └── workshop-examples │ ├── 10-demo │ └── sdlf-workshop │ │ ├── tags.json │ │ ├── dataset-legislators.yaml │ │ ├── team-datalake-engineering-dev.yaml │ │ ├── foundations-datalake-dev.yaml │ │ └── pipeline-main.yaml │ ├── 10-deployment │ ├── sdlf-main │ │ ├── tags.json │ │ ├── team-datalake-engineering-dev.yaml │ │ ├── foundations-datalake-dev.yaml │ │ └── datadomain-datalake-dev.yaml │ └── sdlf-main-datalake-engineering │ │ ├── tags.json │ │ ├── datasets.yaml │ │ ├── pipelines.yaml │ │ └── pipeline-main.yaml │ ├── 20-production │ ├── sdlf-main │ │ ├── tags.json │ │ ├── team-proserve-iot-dev.yaml │ │ ├── team-marketing-industry-dev.yaml │ │ ├── foundations-proserve-dev.yaml │ │ ├── foundations-marketing-dev.yaml │ │ ├── datadomain-proserve-dev.yaml │ │ └── datadomain-marketing-dev.yaml │ └── sdlf-main-proserve-iot │ │ ├── tags.json │ │ ├── datasets.yaml │ │ ├── pipelines.yaml │ │ ├── pipeline-singlestage.yaml │ │ └── pipeline-main.yaml │ ├── legislators │ ├── README.md │ ├── scripts │ │ ├── legislators-glue-job.py │ │ └── legislators-glue-job.yaml │ └── deploy.sh │ └── clean-up.sh ├── .cfnlintrc ├── .github ├── ISSUE_TEMPLATE │ ├── question.md │ ├── feature_request.md │ ├── bug_report.md │ └── support-the-sdlf.md └── workflows │ └── static-checking.yml ├── .readthedocs.yml ├── sdlf-stageA ├── .gitignore ├── lambda │ ├── stage-a-error │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-redrive │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-a-routing │ │ └── src │ │ │ └── lambda_function.py │ └── stage-a-process-object │ │ └── src │ │ └── lambda_function.py └── state-machine │ └── stage-a.asl.json ├── sdlf-stageB ├── .gitignore ├── lambda │ ├── stage-b-error │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-redrive │ │ └── src │ │ │ └── lambda_function.py │ ├── stage-b-fetch-metadata │ │ └── src │ │ │ └── lambda_function.py │ └── stage-b-routing │ │ └── src │ │ └── lambda_function.py └── state-machine │ └── stage-b.asl.json ├── .cfn-nag-deny-list.yml ├── sdlf-monitoring ├── .gitignore └── kibana │ └── generic_dashboard.json ├── sdlf-stage-dataquality ├── .gitignore └── lambda │ ├── stage-redrive │ └── src │ │ └── lambda_function.py │ ├── stage-routing │ └── src │ │ └── lambda_function.py │ └── initial-check │ └── src │ └── lambda_function.py ├── CODE_OF_CONDUCT.md ├── validate.sh ├── pyproject.toml ├── LICENSE ├── .mkdocs.yml └── .gitignore /sdlf-cicd/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-dataset/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-team/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-foundations/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-pipeline/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/README.md: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /sdlf-stage-glue/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs==1.6.1 2 | mkdocs-material==9.5.36 3 | -------------------------------------------------------------------------------- /sdlf-cicd/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /docs/_static/sail-icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sail-icon.ico -------------------------------------------------------------------------------- /docs/_static/sail-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sail-icon.png -------------------------------------------------------------------------------- /docs/_static/sdlf-cicd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-cicd.png -------------------------------------------------------------------------------- /docs/_static/sdlf-team.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-team.png -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | 
-------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /docs/_static/sdlf-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-dataset.png -------------------------------------------------------------------------------- /docs/_static/sdlf-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-pipeline.png -------------------------------------------------------------------------------- /docs/_static/sdlf-monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-monitoring.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-glue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-stage-glue.png -------------------------------------------------------------------------------- /docs/_static/public-references.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/public-references.png -------------------------------------------------------------------------------- /docs/_static/sdlf-foundations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-foundations.png -------------------------------------------------------------------------------- /docs/_static/sdlf-in-a-nutshell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-in-a-nutshell.png -------------------------------------------------------------------------------- /docs/_static/sdlf-pipeline-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-pipeline-full.png -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-stage-lambda.png -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- 
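The tags.json files collected above (sdlf-cicd and the workshop sdlf-main repositories) all carry the same minimal payload, {"Tags": {"Framework": "sdlf"}}, presumably so that every stack the framework deploys is tagged consistently. The buildspec that consumes them is not part of this excerpt; the short Python sketch below only illustrates how such a file could be flattened into the Key=Value strings accepted by the AWS CLI's "cloudformation deploy --tags" option. The helper name tags_to_cli_args and that CLI usage are assumptions, not code from the repository.

import json


def tags_to_cli_args(path: str) -> list[str]:
    # Read a tags.json file shaped like {"Tags": {"Framework": "sdlf"}} and
    # return CloudFormation-style "Key=Value" strings (illustrative helper only).
    with open(path) as f:
        tags = json.load(f).get("Tags", {})
    return [f"{key}={value}" for key, value in tags.items()]


# tags_to_cli_args("sdlf-cicd/tags.json") would return ["Framework=sdlf"]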
/.cfnlintrc: -------------------------------------------------------------------------------- 1 | ignore_templates: 2 | - "sdlf-utils/workshop-examples/10-demo/sdlf-workshop/*.yaml" 3 | include_checks: 4 | - I 5 | ignore_checks: 6 | - W3002 7 | -------------------------------------------------------------------------------- /docs/_static/sdlf-stage-dataquality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-stage-dataquality.png -------------------------------------------------------------------------------- /docs/_static/sdlf-layers-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-layers-architecture.png -------------------------------------------------------------------------------- /docs/_static/sdlf-architecture-datalake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-architecture-datalake.png -------------------------------------------------------------------------------- /docs/_static/sdlf-architecture-datamesh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-architecture-datamesh.png -------------------------------------------------------------------------------- /docs/_static/sdlf-cicd-gluejobsdeployer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/data-lakes-on-aws/HEAD/docs/_static/sdlf-cicd-gluejobsdeployer.png -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tags" : { 3 | "Framework" : "sdlf", 4 | "sdlf:team" : "iot" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask us a question 4 | title: '' 5 | labels: question 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | Please be as specific as possible 11 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.12" 7 | 8 | mkdocs: 9 | configuration: .mkdocs.yml 10 | fail_on_warning: true 11 | 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import DataLakeClient 2 | from .interfaces import DynamoInterface, KMSInterface, S3Interface, SQSInterface, StatesInterface 3 | 4 | __all__ = ["DataLakeClient", "S3Interface", "DynamoInterface", "StatesInterface", "SQSInterface", "KMSInterface"] 5 | 
-------------------------------------------------------------------------------- /sdlf-stageA/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stageB/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /.cfn-nag-deny-list.yml: -------------------------------------------------------------------------------- 1 | RulesToSuppress: 2 | - id: W76 3 | reason: too experimental. https://stelligent.com/2020/03/27/thought-experiment-proposed-complexity-metric-for-iam-policy-documents/ 4 | - id: W89 5 | reason: SDLF does not support running in VPC by default 6 | - id: W92 7 | reason: ReservedConcurrentExecutions 8 | -------------------------------------------------------------------------------- /sdlf-cicd/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv 22 | -------------------------------------------------------------------------------- /sdlf-monitoring/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-dataquality/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Mac/OSX 9 | .DS_Store 10 | 11 | # Windows 12 | Thumbs.db 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Environments 20 | .env 21 | .venv -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_interface import BaseInterface 2 | from .dynamo_interface import DynamoInterface 3 | from .kms_interface import KMSInterface 4 | from .s3_interface import S3Interface 5 | from .sqs_interface import SQSInterface 6 | from .states_interface import StatesInterface 7 | 8 | __all__ = ["BaseInterface", "S3Interface", "DynamoInterface", "StatesInterface", "SQSInterface", "KMSInterface"] 9 | -------------------------------------------------------------------------------- /sdlf-team/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-dataset/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-foundations/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | 
.cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-stage-glue/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/.gitignore: -------------------------------------------------------------------------------- 1 | # Packaged Templates 2 | output/ 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | 8 | # Python 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | 13 | # Editors 14 | .vscode/ 15 | .idea/ 16 | *.swp 17 | 18 | # Mac/OSX 19 | .DS_Store 20 | 21 | # Windows 22 | Thumbs.db 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # Environments 30 | .env 31 | .venv 32 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/datasets.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: iot team datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: awslabs::sdlf::dataset::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: iot 15 | pDatasetName: legislators 16 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/datasets.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Engineering team datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: awslabs::sdlf::dataset::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: engineering 15 | pDatasetName: legislators 16 | 
-------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/team-proserve-iot-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: IOT SDLF Team in proserve domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rIot: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: iot 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/team-marketing-industry-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Industry SDLF Team in marketing domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rIndustry: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: industry 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/dataset-legislators.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Example datasets 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rLegislators: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/dataset/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pS3Prefix: legislators 17 | pDeploymentInstance: dev 18 | pStorageDeploymentInstance: dev 19 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/team-datalake-engineering-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Engineering SDLF Team in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rEngineering: 11 | Type: awslabs::sdlf::team::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pTeamName: engineering 15 | pEnvironment: dev 16 | pSNSNotificationsEmail: nobody@amazon.com 17 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/foundations-proserve-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in proserve domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rProserve: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: 111111111111 15 | pOrg: forecourt 16 | pDomain: proserve 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- 
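The workshop manifests above declare SDLF building blocks in two styles: the 10-deployment and 20-production examples use CloudFormation module types (awslabs::sdlf::team::MODULE, awslabs::sdlf::foundations::MODULE, awslabs::sdlf::dataset::MODULE), while the 10-demo examples nest plain AWS::CloudFormation::Stack resources whose TemplateURL is a dynamic reference such as "{{resolve:ssm:/sdlf/dataset/main}}". At deploy time CloudFormation substitutes the current value of that SSM parameter, which is expected to be an S3 HTTPS URL pointing at the published template (compare template-generic-cfn-template.yaml, which stores exactly such a URL in an SSM parameter). The sketch below, using the standard boto3 SSM API, only illustrates that lookup; it is not part of the repository.

import boto3

ssm = boto3.client("ssm")

# Equivalent of the "{{resolve:ssm:/sdlf/dataset/main}}" dynamic reference used as
# TemplateURL in dataset-legislators.yaml: fetch the parameter's current value.
template_url = ssm.get_parameter(Name="/sdlf/dataset/main")["Parameter"]["Value"]
print(template_url)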
/sdlf-utils/workshop-examples/10-demo/sdlf-workshop/team-datalake-engineering-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering SDLF Team in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rEngineering: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/team/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pTeamName: engineering 17 | pStorageDeploymentInstance: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/foundations-marketing-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in marketing domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMarketing: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: 222222222222 15 | pOrg: forecourt 16 | pDomain: marketing 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/foundations-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: SDLF Foundations in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rForecourt: 11 | Type: awslabs::sdlf::foundations::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pChildAccountId: !Ref "AWS::AccountId" 15 | pOrg: forecourt 16 | pDomain: datalake 17 | pEnvironment: dev 18 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/replicate/src/event-update-table.json: -------------------------------------------------------------------------------- 1 | { 2 | "version":"0", 3 | "id":"1a2ac50f-11dc-111c-09f3-102e0932d2bf", 4 | "detail-type":"Glue Data Catalog Table State Change", 5 | "source":"aws.glue", 6 | "account":"123456789012", 7 | "time":"2020-07-08T12:20:19Z", 8 | "region":"us-east-1", 9 | "resources":[ 10 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/persons" 11 | ], 12 | "detail":{ 13 | "databaseName":"forecourt_datalake_dev_engineering_legislators_db", 14 | "typeOfChange":"UpdateTable", 15 | "tableName":"persons", 16 | "changedPartitions":[ 17 | ] 18 | } 19 | } -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/pipelines.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering team pipelines 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMain: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./pipeline-main.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | Tags: 19 | - Key: 
sdlf:pipeline 20 | Value: main 21 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/foundations-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: SDLF Foundations in datalake domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rAnycompany: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/foundations/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pChildAccountId: !Ref AWS::AccountId 17 | pOrg: anycompany 18 | pDomain: datalake 19 | pDeploymentInstance: dev 20 | pCicdRole: Admin 21 | -------------------------------------------------------------------------------- /sdlf-team/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.team" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /sdlf-foundations/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.foundations" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /sdlf-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.pipeline" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | 12 | packages = [ 13 | { include = "**/*", from = "src", to = "sdlf" }, 14 | ] 15 | 16 | exclude = ["**/*.yaml"] 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.12" 20 | aws-cdk-lib = "^2.159.1" 21 | constructs = ">=10.0.0,<11.0.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: cnfait 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **SDLF release (if known):** 27 | E.g. 1.5.2 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 
31 | -------------------------------------------------------------------------------- /sdlf-stage-glue/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-glue" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.11.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-lambda" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | #sdlf-pipeline = "^2.11.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-cicd/template-generic-cfn-template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pModuleName: 6 | Description: Name of the module 7 | Type: String 8 | pModuleGitRef: 9 | Description: Git reference (commit id) with the sources of this module version 10 | Type: String 11 | pModuleS3Url: 12 | Description: S3 URL (https) to the module template 13 | Type: String 14 | 15 | Resources: 16 | rCloudFormationModuleSsm: 17 | Type: AWS::SSM::Parameter 18 | Properties: 19 | Name: !Sub /sdlf/${pModuleName}/${pModuleGitRef} 20 | Type: String 21 | Value: !Ref pModuleS3Url 22 | Description: S3 URL (https) to the module template of this module version 23 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-ecsfargate" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | 
constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.11.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datalake_library import DataLakeClient 4 | from datalake_library.commons import init_logger 5 | 6 | logger = init_logger(__name__) 7 | 8 | 9 | def lambda_handler(event, context): 10 | try: 11 | if isinstance(event, str): 12 | event = json.loads(event) 13 | 14 | client = DataLakeClient(team=event["team"], pipeline=event["pipeline"], stage=event["pipeline_stage"]) 15 | 16 | logger.info("Execution Failed. Sending original payload to DLQ") 17 | client.sqs.send_message_to_fifo_queue(json.dumps(event), "failed", client.sqs.stage_dlq_url) 18 | except Exception as e: 19 | logger.error("Fatal error", exc_info=True) 20 | raise e 21 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.stage-emrserverless" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf/stage" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.12" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | sdlf-pipeline = "^2.11.0" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /sdlf-dataset/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sdlf.dataset" 3 | version = "2.11.0" 4 | description = "AWS Serverless Data Lake Framework" 5 | authors = ["Amazon Web Services"] 6 | license = "MIT-0" 7 | readme = "README.md" 8 | repository = "https://github.com/awslabs/aws-serverless-data-lake-framework/" 9 | documentation = "https://sdlf.readthedocs.io/en/latest/" 10 | 11 | packages = [ 12 | { include = "**/*", from = "src", to = "sdlf" }, 13 | ] 14 | 15 | exclude = ["**/*.yaml"] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.11" 19 | aws-cdk-lib = "^2.159.1" 20 | constructs = ">=10.0.0,<11.0.0" 21 | aws-cdk-aws-glue-alpha = "^2.159.1a0" 22 | aws-cdk-aws-scheduler-alpha = "^2.159.1a0" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/replicate/src/event-create-delete-table.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0", 3 | "id": "0000000-0000-5328-220a-21c060f6c3f4", 4 | "detail-type": "Glue Data Catalog Database State Change", 5 | "source": "aws.glue", 6 | "account": "123456789012", 7 | "time": "2019-01-16T18:08:48Z", 8 | "region": "us-east-1", 9 | "resources": [ 10 | 
"arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/history", 11 | "arn:aws:glue:us-east-1:123456789012:table/forecourt_datalake_dev_engineering_legislators_db/organizations" 12 | ], 13 | "detail": { 14 | "databaseName": "forecourt_datalake_dev_engineering_legislators_db", 15 | "typeOfChange": "CreateTable", 16 | "changedTables": [ 17 | "history", 18 | "organizations" 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datalake_library import DataLakeClient 4 | from datalake_library.commons import init_logger 5 | 6 | logger = init_logger(__name__) 7 | 8 | 9 | def lambda_handler(event, context): 10 | try: 11 | if isinstance(event, str): 12 | event = json.loads(event) 13 | 14 | client = DataLakeClient( 15 | team=event["body"]["team"], pipeline=event["body"]["pipeline"], stage=event["body"]["pipeline_stage"] 16 | ) 17 | 18 | logger.info("Execution Failed. Sending original payload to DLQ") 19 | client.sqs.send_message_to_fifo_queue(json.dumps(event), "failed", client.sqs.stage_dlq_url) 20 | except Exception as e: 21 | logger.error("Fatal error", exc_info=True) 22 | raise e 23 | -------------------------------------------------------------------------------- /sdlf-cicd/template-glue-job.part: -------------------------------------------------------------------------------- 1 | 2 | r%{BUILDSTEPVARIABLE_NOHYPHEN_AZ}GlueConnection: 3 | Type: AWS::Glue::Connection 4 | Condition: RunInVpc 5 | Metadata: 6 | cfn-lint: 7 | config: 8 | ignore_checks: 9 | - W3010 10 | Properties: 11 | CatalogId: !Ref AWS::AccountId 12 | ConnectionInput: 13 | ConnectionProperties: {} 14 | ConnectionType: NETWORK 15 | Description: "Network connected to the VPC data source" 16 | Name: !Sub sdlf-${pTeamName}-glue-conn-%{BUILDSTEPVARIABLE_NOHYPHEN_AZ} 17 | PhysicalConnectionRequirements: 18 | AvailabilityZone: %{BUILDSTEPVARIABLE_AZ} 19 | SecurityGroupIdList: !Split [",", !ImportValue sdlf-cicd-domain-roles-vpc-security-groups] 20 | SubnetId: %{BUILDSTEPVARIABLE_SUBNET} 21 | -------------------------------------------------------------------------------- /validate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | shopt -s globstar 4 | 5 | # python 6 | ruff format --check . 7 | ruff check . 8 | # pylint $(git ls-files --exclude-standard '*.py') # pylint is disabled for now 9 | trivy fs --scanners vuln . 10 | 11 | # shell 12 | find . -type f \( -name '*.sh' -o -name '*.bash' -o -name '*.ksh' \) -print0 \ 13 | | xargs -0 shellcheck -x --format gcc 14 | 15 | # cloudformation 16 | cfn-lint ./**/*.yaml 17 | 18 | ## unfortunately cfn_nag doesn't support fn::foreach so we exclude files using it: https://github.com/stelligent/cfn_nag/issues/621 19 | find . 
-not \( -type f -name 'template-glue-job.yaml' -o -type f -name 'template-lambda-layer.yaml' \) -type f -name '*.yaml' -print0 \ 20 | | xargs -0 -L 1 cfn_nag_scan --fail-on-warnings --ignore-fatal --deny-list-path .cfn-nag-deny-list.yml --input-path 21 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/commons.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional 3 | 4 | from boto3.dynamodb.types import TypeSerializer 5 | 6 | if TYPE_CHECKING: 7 | from mypy_boto3_dynamodb.type_defs import ( 8 | AttributeValueTypeDef, 9 | ) 10 | 11 | 12 | def init_logger(file_name, log_level=None): 13 | if not log_level: 14 | log_level = "INFO" 15 | logging.basicConfig() 16 | logger = logging.getLogger(file_name) 17 | logger.setLevel(getattr(logging, log_level)) 18 | return logger 19 | 20 | 21 | def serialize_dynamodb_item( 22 | item: Mapping[str, Any], serializer: Optional[TypeSerializer] = None 23 | ) -> Dict[str, "AttributeValueTypeDef"]: 24 | serializer = serializer if serializer else TypeSerializer() 25 | return {k: serializer.serialize(v) for k, v in item.items()} 26 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/kms_interface.py: -------------------------------------------------------------------------------- 1 | from .base_interface import BaseInterface 2 | 3 | 4 | class KMSInterface(BaseInterface): 5 | def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None): 6 | super().__init__(team, dataset, pipeline, stage, log_level, session) 7 | 8 | def _initialize_client(self): 9 | """Initialize KMS client""" 10 | self.kms = self.session.client("kms", config=self.session_config) 11 | 12 | def _load_config(self): 13 | """Load KMS-specific configuration from SSM""" 14 | self.data_kms_key = self._get_ssm_parameter("/SDLF2/KMS/KeyArn") 15 | if self.team: 16 | self.team_data_kms_key = self._get_ssm_parameter(f"/SDLF/KMS/{self.team}/DataKeyId") 17 | self.team_infra_kms_key = self._get_ssm_parameter(f"/SDLF/KMS/{self.team}/InfraKeyId") 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | extend-exclude = ["sdlf-cicd/sam-translate.py"] 3 | line-length = 120 4 | target-version = "py312" 5 | 6 | [tool.ruff.lint] 7 | extend-select = ["I", "PL", "W"] 8 | ignore = ["PLR0912", "PLR0913", "PLR0915"] 9 | fixable = ["I001", "W291"] 10 | 11 | [tool.pylint.main] 12 | py-version = "3.12" 13 | ignore-paths = ["^/sdlf-cicd/sam-translate.py"] 14 | jobs = 0 15 | 16 | [tool.pylint.format] 17 | max-line-length = 120 18 | max-module-lines = 1500 19 | 20 | [tool.pylint.logging] 21 | # The type of string formatting that logging methods do. `old` means using % 22 | # formatting, `new` is for `{}` formatting. 23 | logging-format-style = "new" 24 | 25 | # Logging modules to check that the string format arguments are in logging 26 | # function parameter format. 
27 | logging-modules = ["logging", "datalake_library.commons"] 28 | 29 | [tool.pylint.similarities] 30 | min-similarity-lines = 10 31 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | from datalake_library.sdlf import SQSConfiguration 7 | 8 | logger = init_logger(__name__) 9 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | if isinstance(event, str): 15 | event = json.loads(event) 16 | 17 | sqs_config = SQSConfiguration(instance=deployment_instance) 18 | sqs_interface = SQSInterface(sqs_config.stage_dlq) 19 | 20 | logger.info("Execution Failed. Sending original payload to DLQ") 21 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 22 | except Exception as e: 23 | logger.error("Fatal error", exc_info=True) 24 | raise e 25 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | from datalake_library.sdlf import SQSConfiguration 7 | 8 | logger = init_logger(__name__) 9 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | if isinstance(event, str): 15 | event = json.loads(event) 16 | 17 | sqs_config = SQSConfiguration(instance=deployment_instance) 18 | sqs_interface = SQSInterface(sqs_config.stage_dlq) 19 | 20 | logger.info("Execution Failed. Sending original payload to DLQ") 21 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 22 | except Exception as e: 23 | logger.error("Fatal error", exc_info=True) 24 | raise e 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/support-the-sdlf.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Support the SDLF 3 | about: Add your organisation's name or logo to the SDLF GitHub read.me 4 | title: "[Support the SDLF]: " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Thank you for letting us use your organisation's name on the SDLF read.me page and letting other customers know that you support the project! If you would like us to also display your organisation's logo. please raise a linked pull request to provide an image file for the logo. 11 | 12 | Please add any files to *docs/source/_static/* 13 | 14 | Organisation Name: 15 | Your Name: 16 | Your Position: 17 | I have included a logo: y/n 18 | 19 | *By raising a Support the SDLF issue (and related pull request), you are granting AWS permission to use your company’s name (and logo) for the limited purpose described here and you are confirming that you have authority to grant such permission.* 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import DataLakeClient 4 | from datalake_library.commons import init_logger 5 | 6 | logger = init_logger(__name__) 7 | 8 | 9 | def lambda_handler(event, context): 10 | try: 11 | client = DataLakeClient(team=os.environ["TEAM"], pipeline=os.environ["PIPELINE"], stage=os.environ["STAGE"]) 12 | 13 | messages = client.sqs.receive_messages(1, client.sqs.stage_dlq_url) 14 | if not messages: 15 | logger.info("No messages found in DLQ") 16 | return 17 | 18 | logger.info("Received {} messages".format(len(messages))) 19 | for message in messages: 20 | client.sqs.send_message_to_fifo_queue(message["Body"], "redrive", client.sqs.stage_queue_url) 21 | logger.info("Redrive message succeeded") 22 | except Exception as e: 23 | logger.error("Fatal error", exc_info=True) 24 | raise e 25 | return 26 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/datadomain-proserve-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: proserve data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rProserve: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-proserve-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rIot: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rProserve 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-proserve-iot-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: iot 31 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/client.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | from .interfaces import DynamoInterface, KMSInterface, S3Interface, SQSInterface, StatesInterface 4 | 5 | 6 | class DataLakeClient: 7 | def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None): 8 | """ 9 | Unified client for all data lake operations with shared boto3 session 
10 | """ 11 | # Shared session across all interfaces 12 | self.session = session or boto3.Session() 13 | 14 | # Initialize all interfaces with shared session 15 | self.s3 = S3Interface(team, dataset, pipeline, stage, log_level, self.session) 16 | self.dynamo = DynamoInterface(team, dataset, pipeline, stage, log_level, self.session) 17 | self.states = StatesInterface(team, dataset, pipeline, stage, log_level, self.session) 18 | self.sqs = SQSInterface(team, dataset, pipeline, stage, log_level, self.session) 19 | self.kms = KMSInterface(team, dataset, pipeline, stage, log_level, self.session) 20 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main/datadomain-marketing-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: marketing data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMarketing: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-marketing-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rIndustry: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rMarketing 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-marketing-industry-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: industry 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main/datadomain-datalake-dev.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: datalake data domain, dev environment 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rForecourt: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./foundations-datalake-dev.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | 19 | rEngineering: 20 | Type: AWS::CloudFormation::Stack 21 | DependsOn: rForecourt 22 | DeletionPolicy: Delete 23 | UpdateReplacePolicy: Delete 24 | Properties: 25 | TemplateURL: ./team-datalake-engineering-dev.yaml 26 | Parameters: 27 | pPipelineReference: !Ref pPipelineReference 28 | Tags: 29 | - Key: sdlf:team 30 | Value: engineering 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipelines.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: iot team pipelines 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMain: 11 | Type: AWS::CloudFormation::Stack 12 | DeletionPolicy: Delete 13 | UpdateReplacePolicy: Delete 14 | Properties: 15 | TemplateURL: ./pipeline-main.yaml 16 | Parameters: 17 | pPipelineReference: !Ref pPipelineReference 18 | Tags: 19 | - Key: sdlf:pipeline 20 | Value: main 21 | rSingleStage: 22 | Type: AWS::CloudFormation::Stack 23 | DeletionPolicy: Delete 24 | UpdateReplacePolicy: Delete 25 | Properties: 26 | TemplateURL: 
./pipeline-singlestage.yaml 27 | Parameters: 28 | pPipelineReference: !Ref pPipelineReference 29 | Tags: 30 | - Key: sdlf:pipeline 31 | Value: singlestage 32 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | 8 | logger = init_logger(__name__) 9 | team = os.environ["TEAM"] 10 | dataset = os.environ["DATASET"] 11 | pipeline = os.environ["PIPELINE"] 12 | pipeline_stage = os.environ["PIPELINE_STAGE"] 13 | org = os.environ["ORG"] 14 | domain = os.environ["DOMAIN"] 15 | env = os.environ["ENV"] 16 | 17 | 18 | def lambda_handler(event, context): 19 | try: 20 | if isinstance(event, str): 21 | event = json.loads(event) 22 | 23 | sqs_config = SQSConfiguration(team, pipeline, pipeline_stage) 24 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 25 | 26 | logger.info("Execution Failed. Sending original payload to DLQ") 27 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 28 | except Exception as e: 29 | logger.error("Fatal error", exc_info=True) 30 | raise e 31 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/error/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | 8 | logger = init_logger(__name__) 9 | team = os.environ["TEAM"] 10 | dataset = os.environ["DATASET"] 11 | pipeline = os.environ["PIPELINE"] 12 | pipeline_stage = os.environ["PIPELINE_STAGE"] 13 | org = os.environ["ORG"] 14 | domain = os.environ["DOMAIN"] 15 | env = os.environ["ENV"] 16 | 17 | 18 | def lambda_handler(event, context): 19 | try: 20 | if isinstance(event, str): 21 | event = json.loads(event) 22 | 23 | sqs_config = SQSConfiguration(team, pipeline, pipeline_stage) 24 | sqs_interface = SQSInterface(sqs_config.get_stage_dlq_name) 25 | 26 | logger.info("Execution Failed. 
Sending original payload to DLQ") 27 | sqs_interface.send_message_to_fifo_queue(json.dumps(event), "failed") 28 | except Exception as e: 29 | logger.error("Fatal error", exc_info=True) 30 | raise e 31 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipeline-singlestage.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Single stage pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rSingleA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: singlestage 16 | pTeamName: iot 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF2/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "iot/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.interfaces.sqs_interface import SQSInterface 5 | from datalake_library.sdlf import SQSConfiguration 6 | 7 | logger = init_logger(__name__) 8 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | sqs_config = SQSConfiguration(instance=deployment_instance) 14 | dlq_interface = SQSInterface(sqs_config.stage_dlq) 15 | messages = dlq_interface.receive_messages(1) 16 | if not messages: 17 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 18 | return 19 | 20 | logger.info("Received {} messages".format(len(messages))) 21 | queue_interface = SQSInterface(sqs_config.stage_queue) 22 | for message in messages: 23 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 24 | logger.info("Redrive message succeeded") 25 | except Exception as e: 26 | logger.error("Fatal error", exc_info=True) 27 | raise e 28 | return 29 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | 
logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | sqs_config = SQSConfiguration(os.environ["TEAM"], os.environ["PIPELINE"], os.environ["STAGE"]) 13 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 14 | messages = dlq_interface.receive_messages(1) 15 | if not messages: 16 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 17 | return 18 | 19 | logger.info("Received {} messages".format(len(messages))) 20 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 21 | for message in messages: 22 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 23 | logger.info("Redrive message succeeded") 24 | except Exception as e: 25 | logger.error("Fatal error", exc_info=True) 26 | raise e 27 | return 28 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/template-lambda-layer.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy Lambda Layer 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | Type: String 8 | pLayerName: 9 | Description: Name of the lambda layer 10 | Type: String 11 | AllowedPattern: "^[a-zA-Z0-9]*$" 12 | pGitRef: 13 | Description: Git reference (commit id) with the sources of these layers 14 | Type: String 15 | 16 | Resources: 17 | rDatalakeLibraryLambdaLayer: 18 | Type: AWS::Lambda::LayerVersion 19 | Properties: 20 | CompatibleRuntimes: 21 | - python3.12 22 | Content: 23 | S3Bucket: !Ref pArtifactsBucket 24 | S3Key: !Sub sdlf/layers/${pLayerName}-${pGitRef}.zip 25 | Description: !Sub ${pLayerName} Lambda Layer 26 | LayerName: !Sub "sdlf-${pLayerName}" 27 | 28 | rDatalakeLibraryLambdaLayerSsm: 29 | Type: AWS::SSM::Parameter 30 | Properties: 31 | Name: !Sub "/SDLF/Lambda/Latest${pLayerName}Layer" 32 | Type: String 33 | Value: !Ref rDatalakeLibraryLambdaLayer 34 | Description: !Sub The ARN of the latest version of the ${pLayerName} layer 35 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.configuration.resource_configs import SQSConfiguration 5 | from datalake_library.interfaces.sqs_interface import SQSInterface 6 | 7 | logger = init_logger(__name__) 8 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 9 | 10 | 11 | def lambda_handler(event, context): 12 | try: 13 | sqs_config = SQSConfiguration(instance=deployment_instance) 14 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 15 | messages = dlq_interface.receive_messages(1) 16 | if not messages: 17 | logger.info("No messages 
found in {}".format(sqs_config.get_stage_dlq_name)) 18 | return 19 | 20 | logger.info("Received {} messages".format(len(messages))) 21 | queue_interface = SQSInterface(sqs_config.get_stage_queue_name) 22 | for message in messages: 23 | queue_interface.send_message_to_fifo_queue(message["Body"], "redrive") 24 | logger.info("Redrive message succeeded") 25 | except Exception as e: 26 | logger.error("Fatal error", exc_info=True) 27 | raise e 28 | return 29 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library import DataLakeClient 5 | from datalake_library.commons import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | team = os.environ["TEAM"] 13 | pipeline = os.environ["PIPELINE"] 14 | stage = os.environ["STAGE"] 15 | 16 | client = DataLakeClient(team=team, pipeline=pipeline, stage=stage) 17 | 18 | messages = client.sqs.receive_messages(1, client.sqs.stage_dlq_url) 19 | if not messages: 20 | logger.info("No messages found in DLQ") 21 | return 22 | 23 | logger.info("Received {} messages".format(len(messages))) 24 | for message in messages: 25 | logger.info("Starting State Machine Execution") 26 | if isinstance(message["Body"], str): 27 | response = json.loads(message["Body"]) 28 | client.states.run_state_machine(client.states.state_machine_arn, response) 29 | logger.info("Redrive message succeeded") 30 | except Exception as e: 31 | logger.error("Fatal error", exc_info=True) 32 | raise e 33 | return 34 | -------------------------------------------------------------------------------- /.mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AWS Serverless Data Lake Framework 2 | site_url: https://sdlf.readthedocs.io 3 | repo_url: https://github.com/awslabs/aws-serverless-data-lake-framework 4 | copyright: Amazon Web Services, Inc. All Rights Reserved. 
5 | theme: 6 | name: material 7 | features: 8 | - navigation.tabs 9 | - navigation.tabs.sticky 10 | - toc.integrate 11 | - navigation.indexes 12 | - navigation.path 13 | logo: _static/sdlf-logo.svg 14 | favicon: _static/sail-icon.ico 15 | markdown_extensions: 16 | - admonition 17 | - attr_list 18 | - tables 19 | plugins: 20 | - search 21 | nav: 22 | - index.md 23 | - architecture.md 24 | - Constructs: 25 | - constructs/index.md 26 | - constructs/foundations.md 27 | - constructs/team.md 28 | - constructs/dataset.md 29 | - constructs/pipeline.md 30 | - constructs/stage-lambda.md 31 | - constructs/stage-glue.md 32 | - constructs/stage-dataquality.md 33 | - constructs/monitoring.md 34 | - constructs/cicd.md 35 | - 'Workshop': 'https://sdlf.workshop.aws/' 36 | - 'License': 'https://github.com/awslabs/aws-serverless-data-lake-framework/blob/main/LICENSE' 37 | - 'Contributing': 'https://github.com/awslabs/aws-serverless-data-lake-framework/blob/main/CONTRIBUTING.md' 38 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/catalog-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | dlq_name = os.environ["DLQ"] 9 | queue_name = os.environ["QUEUE"] 10 | sqs_endpoint_url = "https://sqs." + os.getenv("AWS_REGION") + ".amazonaws.com" 11 | sqs = boto3.client("sqs", endpoint_url=sqs_endpoint_url) 12 | 13 | 14 | def lambda_handler(event, context): 15 | try: 16 | dlq_queue_url = sqs.get_queue_url(QueueName=dlq_name)["QueueUrl"] 17 | queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"] 18 | 19 | messages = sqs.receive_message(QueueUrl=dlq_queue_url, MaxNumberOfMessages=1, WaitTimeSeconds=1).get("Messages", []) 20 | if not messages: 21 | logger.info("No messages found in {}".format(dlq_name)) 22 | return 23 | 24 | logger.info("Received {} messages".format(len(messages))) 25 | for message in messages: 26 | sqs.send_message(QueueUrl=queue_url, MessageBody=message["Body"]) 27 | sqs.delete_message(QueueUrl=dlq_queue_url, ReceiptHandle=message["ReceiptHandle"]) 28 | logger.info("Delete message succeeded") 29 | except Exception as e: 30 | logger.error("Fatal error", exc_info=True) 31 | raise e 32 | return 33 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/states_interface.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import date, datetime 3 | 4 | from .base_interface import BaseInterface 5 | 6 | 7 | class StatesInterface(BaseInterface): 8 | def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None): 9 | super().__init__(team, dataset, pipeline, stage, log_level, session) 10 | 11 | def _initialize_client(self): 12 | """Initialize Step Functions client""" 13 | self.stepfunctions = self.session.client("stepfunctions", config=self.session_config) 14 | 15 | def _load_config(self): 16 | """Load Step Functions-specific configuration from SSM""" 17 | if self.team and self.stage and self.pipeline: 18 | self.state_machine_arn = self._get_ssm_parameter(f"/SDLF/SM/{self.team}/{self.pipeline}{self.stage}SM") 19 | 20 | @staticmethod 21 | def json_serial(obj): 22 | """JSON serializer for objects not serializable by default""" 23 | if isinstance(obj,
(datetime, date)): 24 | return obj.isoformat() 25 | raise TypeError("Type %s not serializable" % type(obj)) 26 | 27 | def run_state_machine(self, machine_arn, message): 28 | return self.stepfunctions.start_execution( 29 | stateMachineArn=machine_arn, input=json.dumps(message, default=self.json_serial) 30 | ) 31 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/buildspec.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CFN_ENDPOINT="https://cloudformation.$AWS_REGION.amazonaws.com" 4 | 5 | pip uninstall -y aws-sam-cli && unzip -q aws-sam-cli-linux-x86_64.zip -d sam-installation 6 | ./sam-installation/install && sam --version 7 | pip install "cfn-lint<1" cloudformation-cli 8 | 9 | # removing everything up to the first hyphen, then anything that isn't a letter/number, and lower-casing what's left 10 | module_name_without_prefix="${SDLF_CONSTRUCT#*-}" 11 | module_name_alnum="${module_name_without_prefix//[^[:alnum:]]/}" 12 | MODULE="${module_name_alnum,,}" 13 | MODULE="DatalakeLibrary" # TODO 14 | 15 | mkdir artifacts 16 | zip -r artifacts/datalake_library.zip ./python -x \*__pycache__\* 17 | LAYER_HASH="$(sha256sum artifacts/datalake_library.zip | cut -c1-12)" 18 | aws s3api put-object --bucket "$ARTIFACTS_BUCKET" \ 19 | --key "sdlf/layers/$MODULE-$LAYER_HASH.zip" \ 20 | --body artifacts/datalake_library.zip 21 | 22 | STACK_NAME="sdlf-lambdalayers-$MODULE" 23 | aws cloudformation --endpoint-url "$CFN_ENDPOINT" deploy \ 24 | --stack-name "$STACK_NAME" \ 25 | --template-file ./template-lambda-layer.yaml \ 26 | --parameter-overrides \ 27 | pArtifactsBucket="$ARTIFACTS_BUCKET" \ 28 | pLayerName="$MODULE" \ 29 | pGitRef="$LAYER_HASH" \ 30 | --tags Framework=sdlf \ 31 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" || exit 1 32 | 33 | echo "done" 34 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/dynamo_interface.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | from boto3.dynamodb.types import TypeSerializer 4 | 5 | from ..commons import serialize_dynamodb_item 6 | from .base_interface import BaseInterface 7 | 8 | 9 | class DynamoInterface(BaseInterface): 10 | def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None): 11 | super().__init__(team, dataset, pipeline, stage, log_level, session) 12 | 13 | def _initialize_client(self): 14 | # DynamoDB specific client 15 | self.dynamodb = self.session.client("dynamodb", config=self.session_config) 16 | 17 | def _load_config(self): 18 | """Load DynamoDB-specific configuration from SSM""" 19 | self.object_metadata_table = self._get_ssm_parameter("/SDLF2/Dynamo/ObjectCatalog") 20 | self.manifests_table = self._get_ssm_parameter("/SDLF2/Dynamo/Manifests") 21 | 22 | @staticmethod 23 | def build_id(bucket, key): 24 | return f"s3://{bucket}/{key}" 25 | 26 | def put_item(self, table, item): 27 | serializer = TypeSerializer() 28 | self.dynamodb.put_item(TableName=table, Item=serialize_dynamodb_item(item, serializer)) 29 | 30 | def update_object_metadata_catalog(self, item): 31 | item["id"] = self.build_id(item["bucket"], item["key"]) 32 | item["timestamp"] = int(round(dt.datetime.now(dt.UTC).timestamp() * 1000, 0)) 33 | return self.put_item(self.object_metadata_table, item) 34 | 
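A minimal usage sketch (editorial addition, not a file in the repository): it shows how `serialize_dynamodb_item` from `datalake_library/commons.py` turns a plain Python dict into the DynamoDB AttributeValue format that `DynamoInterface.put_item` hands to the low-level `dynamodb` client. The item fields are hypothetical, and the snippet assumes the `datalake_library` layer is importable.

```python
# Sketch only: a hypothetical object-metadata item, mirroring DynamoInterface.build_id()'s
# "s3://bucket/key" convention and the integer millisecond timestamp it stores.
from boto3.dynamodb.types import TypeSerializer

from datalake_library.commons import serialize_dynamodb_item

item = {
    "id": "s3://raw-bucket/engineering/legislators/persons.json",
    "bucket": "raw-bucket",
    "key": "engineering/legislators/persons.json",
    "timestamp": 1700000000000,
}

serialized = serialize_dynamodb_item(item, TypeSerializer())
# serialized is now in DynamoDB wire format, e.g.
# {"id": {"S": "s3://raw-bucket/..."}, "bucket": {"S": "raw-bucket"},
#  "key": {"S": "engineering/legislators/persons.json"}, "timestamp": {"N": "1700000000000"}}
```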
-------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/stage-redrive/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import SQSConfiguration, StateMachineConfiguration 6 | from datalake_library.interfaces.sqs_interface import SQSInterface 7 | from datalake_library.interfaces.states_interface import StatesInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | team = os.environ["TEAM"] 15 | pipeline = os.environ["PIPELINE"] 16 | stage = os.environ["STAGE"] 17 | state_config = StateMachineConfiguration(team, pipeline, stage) 18 | sqs_config = SQSConfiguration(team, pipeline, stage) 19 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 20 | 21 | messages = dlq_interface.receive_messages(1) 22 | if not messages: 23 | logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name)) 24 | return 25 | 26 | logger.info("Received {} messages".format(len(messages))) 27 | for message in messages: 28 | logger.info("Starting State Machine Execution") 29 | if isinstance(message["Body"], str): 30 | response = json.loads(message["Body"]) 31 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 32 | logger.info("Redrive message succeeded") 33 | except Exception as e: 34 | logger.error("Fatal error", exc_info=True) 35 | raise e 36 | return 37 | -------------------------------------------------------------------------------- /sdlf-cicd/template-generic-cfn-module.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | Type: String 8 | pLibraryOrg: 9 | Description: Name of the org (all lowercase, no symbols or spaces) 10 | Type: String 11 | pLibraryFramework: 12 | Description: Name of the framework (all lowercase, no symbols or spaces) 13 | Type: String 14 | pLibraryModule: 15 | Description: Name of the module 16 | Type: String 17 | pModuleGitRef: 18 | Description: Git reference (commit id) with the sources of this module version 19 | Type: String 20 | 21 | Resources: 22 | rCloudFormationModule: 23 | Type: AWS::CloudFormation::ModuleVersion 24 | Properties: 25 | ModuleName: !Sub "${pLibraryOrg}::${pLibraryFramework}::${pLibraryModule}::MODULE" 26 | ModulePackage: !Sub "s3://${pArtifactsBucket}/modules/${pLibraryOrg}/${pLibraryFramework}/${pLibraryModule}-${pModuleGitRef}.zip" 27 | 28 | rCloudFormationModuleDefaultVersion: 29 | Type: AWS::CloudFormation::ModuleDefaultVersion 30 | Properties: 31 | Arn: !Ref rCloudFormationModule 32 | 33 | rCloudFormationModuleSsm: 34 | Type: AWS::SSM::Parameter 35 | DependsOn: rCloudFormationModuleDefaultVersion 36 | Properties: 37 | Name: !Sub /SDLF/CFN/${pLibraryOrg}-${pLibraryFramework}-${pLibraryModule}-MODULE 38 | Type: String 39 | Value: !Ref pModuleGitRef 40 | Description: Git reference (commit id) with the sources of this module version -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-pipeline.drawio: -------------------------------------------------------------------------------- 1 | 
7VrbctowEP0aZtqHMMgyt0duucyQPpTOJHliNLawXWzLlUWAfn1XWAZsCZJ2uCZ9CdZKtrTnHO1q7VRwL1rccZL4j8ylYcWquYsK7lcsq2E34K80LDNDE6HM4PHAzUxbhlHwmypjTVlngUvTwkDBWCiCpGh0WBxTRxRshHM2Lw6bsLA4a0I8qhlGDgl161PgCl9ZUaO96bingeerqVtWM+uISD5YeZL6xGXzLRMeVHCPMyayq2jRo6HELsclu+92R+96YZzG4j03kF/93qBPX57Hj+i50ZxMn6f3N1b2lFcSzpTDarFimSPA2Sx2qXxIrYK7cz8QdJQQR/bOgXKw+SIKoYXgchKEYY+FjEM7ZjEM6qoZKBd0sXPpaA0ICImyiAq+hCHqhmY9u0NpyMIK0vmGEbumbP4WGXZTGYlSgbd+9AYouFBY/QVuWMNt1B/eSqeChIYBOF6GEbwXRaxSwdmUltAyAEjCwIuh6QBaFOxdiWUAEu2ojihwXTmNkZwifUcgA+lktAxcWMeiwtao6DyNwDASNIGf21nsiIDFqVwi49NJCJuwTA6bCUlabx1EJFYeJ24AoJTo2GLQhPeExUKFMWTlbTWTfCqEgUReRwtPBswqmad21QOWktWUDxDIjL1juByn4NN4kns0XvtTFlPFwnW73m3YZkWVBSRYsqWzkE6kUlPwK4i94arVxzXli2kKl6T+ISWGyhqrtzWNWaYNj9roSCpr6RteQOjvgO0LrXpV+GXc8SnwQASVYnOJIDIPsFcaSb8h+XAWwc/3ztMqi0mN/ujcDb5+3FiBaqhajt06ldiunzBe5PFKj92pZPRCyeBMELnpoXnTPhQ7Fi5wgw3bzBTK0bGosfS0qtFBXTifqSbjwmcei0k42FhLwG3GDJkMcysGf1IhlipKkxlsxQK/2Zxyov3AwrrYjDt0n9hUdhKEe1Tsc90yU8VpCLy/Fldy+D2hJ1GVPpFGwAbeXSnw6OfBdXDIhWs4gyBkOoS0j6Xc5pULNd9pbwu1cU6h5svUhbqvcLkcoRoi7GmFihq7s5/E8SKT30GYaBXPIe0zly2WgYhMyfgalGyfPeTiugbg/8LvgxV+tmGXnrjww6Z9CnVC9x8qv1W5l9d+nW+d4cuPh97oA9d/TVwq/9Zv5Aqv7tAJ4y5u7k6An7v8q78zJR6t/MP6SxaNjus6VecvO948Vds7qDrNqTpf5rWUf6hdFK6NznwWsWsaTtcl1BzBt4WKzynUfJnXUv6VhWqIsCcWqqF+/hTlH7aKTFh6yDjtVyvDF8RLLv9KSm4cMeRCc/NVfNW39a8FePAH -------------------------------------------------------------------------------- /docs/constructs/monitoring.md: -------------------------------------------------------------------------------- 1 | # sdlf-monitoring 2 | 3 | !!! note 4 | `sdlf-monitoring` is defined in the [sdlf-monitoring](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-monitoring) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Monitoring](../_static/sdlf-monitoring.png) 9 | 10 | CloudTrail (Auditing) and S3 Storage Lens are resources implemented in the framework. They are deployed once only and are consumed by all systems and users across the lake. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rProserveMonitoring: 20 | Type: awslabs::sdlf::monitoring::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pCloudtrailEnabled: true 24 | ``` 25 | 26 | ## Interface 27 | 28 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-monitoring` publishes the following parameters: 29 | 30 | | SSM Parameter | Description | Comment | 31 | |-----------------------------| ---------------------------------------------------------------- | -------------------------------------------- | 32 | | `/SDLF/S3/CloudTrailBucket` | Name of CloudTrail S3 bucket | | 33 | 34 | -------------------------------------------------------------------------------- /sdlf-stageA/lambda/stage-a-routing/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library import DataLakeClient 5 | from datalake_library.commons import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def lambda_handler(event, context): 11 | try: 12 | logger.info("Received {} messages".format(len(event["Records"]))) 13 | for record in event["Records"]: 14 | logger.info("Starting State Machine Execution") 15 | event_body = json.loads(record["body"]) 16 | object_key = event_body["object"]["key"].split("/") 17 | team = object_key[0] 18 | dataset = object_key[1] 19 | pipeline = os.environ["PIPELINE"] 20 | pipeline_stage = os.environ["PIPELINE_STAGE"] 21 | org = os.environ["ORG"] 22 | domain = os.environ["DOMAIN"] 23 | env = os.environ["ENV"] 24 | 25 | event_with_pipeline_details = { 26 | **event_body["object"], 27 | "bucket": event_body["bucket"]["name"], 28 | "team": team, 29 | "dataset": dataset, 30 | "pipeline": pipeline, 31 | "pipeline_stage": pipeline_stage, 32 | "org": org, 33 | "domain": domain, 34 | "env": env, 35 | } 36 | 37 | client = DataLakeClient(team=team, pipeline=pipeline, stage=pipeline_stage) 38 | client.states.run_state_machine(client.states.state_machine_arn, event_with_pipeline_details) 39 | except Exception as e: 40 | logger.error("Fatal error", exc_info=True) 41 | raise e 42 | -------------------------------------------------------------------------------- /sdlf-cicd/template-cfn-module.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Deploy a CloudFormation module 3 | 4 | Parameters: 5 | pArtifactsBucket: 6 | Description: The artifacts bucket used by CodeBuild and CodePipeline 7 | Type: String 8 | pEnvironment: 9 | Description: Environment name 10 | Type: String 11 | AllowedValues: [dev, test, prod] 12 | pDomain: 13 | Description: Name of the data domain (all lowercase, no symbols or spaces) 14 | Type: String 15 | pTeamName: 16 | Description: Name of the team (all lowercase, no symbols or spaces) 17 | Type: String 18 | pModuleName: 19 | Description: Name of the module 20 | Type: String 21 | pModuleGitRef: 22 | Description: Git reference (commit id) with the sources of this module version 23 | Type: String 24 | 25 | Resources: 26 | rCloudFormationModule: 27 | Type: AWS::CloudFormation::ModuleVersion 28 | Properties: 29 | ModuleName: !Sub "${pDomain}::${pTeamName}::${pModuleName}::MODULE" 30 | ModulePackage: !Sub "s3://${pArtifactsBucket}/modules/${pDomain}/${pEnvironment}/${pTeamName}/${pModuleName}-${pModuleGitRef}.zip" 31 | 32 | rCloudFormationModuleDefaultVersion: 33 | Type: AWS::CloudFormation::ModuleDefaultVersion 34 | Properties: 35 | Arn: !Ref rCloudFormationModule 36 | 37 | rCloudFormationModuleSsm: 38 | Type: AWS::SSM::Parameter 39 | DependsOn: rCloudFormationModuleDefaultVersion 40 | Properties: 41 | Name: !Sub /SDLF/CFN/${pDomain}-${pTeamName}-${pModuleName}-MODULE 42 | Type: String 43 | Value: !Ref pModuleGitRef 44 | Description: 
Git reference (commit id) with the sources of this module version 45 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | SDLF supports both a centralized datalake deployment pattern and decentralized data domains which could be used as a basis for a [data mesh](https://aws.amazon.com/what-is/data-mesh/) deployment pattern. 4 | 5 | ## Centralized Data Lake 6 | 7 | ![Centralized Data Lake Architecture](_static/sdlf-architecture-datalake.png) 8 | 9 | !!! warning 10 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 11 | 12 | ## Data Mesh 13 | 14 | The Data Mesh pattern is fundamentally about decentralized data ownership, with data owned by specialized domain teams rather than a centralized data team. This usually means: 15 | 16 | - each data domain team has its own dedicated data infrastructure, for production and/or consumption 17 | - each data domain team is able to deploy the tools and infrastructure it needs - a self-serve data platform 18 | 19 | A governance layer is federating data assets in a business catalog to ensure compliance against policies and standards, and ease of data sharing across teams. 20 | 21 | As such, it can be seen as a collection of data domain-specific datalakes deployed with SDLF. [Amazon SageMaker Data and AI Governance](https://aws.amazon.com/sagemaker/data-ai-governance/) (built on Amazon DataZone) can be used for the governance layer. 22 | 23 | ![Data Mesh Architecture](_static/sdlf-architecture-datamesh.png) 24 | 25 | !!! warning 26 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 27 | 28 | ## Transactional Data Lake 29 | 30 | Using [Iceberg](https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/introduction.html). 31 | 32 | !!! warning 33 | We strongly recommend that customers conduct a [Well Architected Review](https://aws.amazon.com/architecture/well-architected/) of their SDLF implementation. 
34 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-deployment/sdlf-main-datalake-engineering/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Engineering team Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: main 16 | pTeamName: engineering 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF2/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "engineering/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | 33 | rMainB: 34 | Type: awslabs::sdlf::stageB::MODULE 35 | Properties: 36 | pPipelineReference: !Ref pPipelineReference 37 | pDatasetBucket: "{{resolve:ssm:/SDLF2/S3/StageBucket}}" 38 | pStageName: B 39 | pPipeline: main 40 | pTeamName: engineering 41 | pTriggerType: schedule 42 | pEventPattern: !Sub >- 43 | { 44 | "source": ["aws.states"], 45 | "detail-type": ["Step Functions Execution Status Change"], 46 | "detail": { 47 | "status": ["SUCCEEDED"], 48 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-engineering-main-sm-A"] 49 | } 50 | } 51 | pSchedule: "cron(*/5 * * * ? *)" 52 | pEnableTracing: false 53 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/20-production/sdlf-main-proserve-iot/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: awslabs::sdlf::stageA::MODULE 12 | Properties: 13 | pPipelineReference: !Ref pPipelineReference 14 | pStageName: A 15 | pPipeline: main 16 | pTeamName: iot 17 | pTriggerType: event 18 | pEventPattern: >- 19 | { 20 | "source": ["aws.s3"], 21 | "detail-type": ["Object Created"], 22 | "detail": { 23 | "bucket": { 24 | "name": ["{{resolve:ssm:/SDLF2/S3/RawBucket}}"] 25 | }, 26 | "object": { 27 | "key": [{ "prefix": "iot/legislators/" }] 28 | } 29 | } 30 | } 31 | pEnableTracing: false 32 | 33 | rMainB: 34 | Type: awslabs::sdlf::stageB::MODULE 35 | Properties: 36 | pPipelineReference: !Ref pPipelineReference 37 | pDatasetBucket: "{{resolve:ssm:/SDLF2/S3/StageBucket}}" 38 | pStageName: B 39 | pPipeline: main 40 | pTeamName: iot 41 | pTriggerType: schedule 42 | pEventPattern: !Sub >- 43 | { 44 | "source": ["aws.states"], 45 | "detail-type": ["Step Functions Execution Status Change"], 46 | "detail": { 47 | "status": ["SUCCEEDED"], 48 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 49 | } 50 | } 51 | pSchedule: "cron(*/5 * * * ?
*)" 52 | pEnableTracing: false 53 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/README.md: -------------------------------------------------------------------------------- 1 | # Dataset Example 2 | 3 | This example demonstrates how to hydrate the data lake with a sample dataset for a specific team. 4 | 5 | ## Usage 6 | 7 | ```bash 8 | ./deploy.sh -t <team> -d <dataset> -r <region> [-p <aws-profile>] 9 | ``` 10 | 11 | ### Parameters 12 | 13 | - `-t <team>` (required): Team name (2-12 characters, lowercase letters and numbers only) 14 | - `-d <dataset>` (required): Dataset name (2-12 characters, lowercase letters and numbers only) 15 | - `-r <region>` (required): AWS region (e.g., us-east-1, eu-west-1) 16 | - `-p <aws-profile>` (optional): AWS profile to use 17 | - `-h`: Show help message 18 | 19 | ### Examples 20 | 21 | ```bash 22 | # Deploy legislators dataset for team "engineering" in us-east-1 23 | ./deploy.sh -t engineering -d legislators -r us-east-1 24 | 25 | # Deploy customers dataset for team "analytics" in eu-west-1 with specific AWS profile 26 | ./deploy.sh -t analytics -d customers -r eu-west-1 -p my-profile 27 | ``` 28 | 29 | ## What it creates 30 | 31 | 1. **Glue Job**: `sdlf-<team>-<dataset>-glue-job` 32 | 2. **IAM Role**: `sdlf-<team>-<dataset>-glue-role` 33 | 3. **CloudFormation Stack**: `sdlf-<team>-<dataset>-glue-job` 34 | 4. **S3 Data**: Uploads sample data to `s3://<raw-bucket>/<team>/<dataset>/` 35 | 36 | ## Data Processing 37 | 38 | The Glue job processes three JSON files: 39 | - `persons.json` → `persons/` (Parquet) 40 | - `memberships.json` → `memberships/` (Parquet) 41 | - `organizations.json` → `organizations/` (Parquet) 42 | - Creates a joined `history/` dataset partitioned by organization name 43 | 44 | ## Prerequisites 45 | 46 | - AWS CLI configured 47 | - SDLF framework deployed in the specified region 48 | - Appropriate IAM permissions 49 | 50 | ## Example Resource Names 51 | 52 | For team "engineering" and dataset "legislators": 53 | - Glue Job: `sdlf-engineering-legislators-glue-job` 54 | - IAM Role: `sdlf-engineering-legislators-glue-role` 55 | - S3 Path: `s3://<raw-bucket>/engineering/legislators/` 56 | -------------------------------------------------------------------------------- /sdlf-cicd/lambda/stagesrepositories-cicd/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | 9 | codecommit_endpoint_url = "https://codecommit." + os.getenv("AWS_REGION") + ".amazonaws.com" 10 | codecommit = boto3.client("codecommit", endpoint_url=codecommit_endpoint_url) 11 | codepipeline_endpoint_url = "https://codepipeline."
+ os.getenv("AWS_REGION") + ".amazonaws.com" 12 | codepipeline = boto3.client("codepipeline", endpoint_url=codepipeline_endpoint_url) 13 | 14 | 15 | def lambda_handler(event, context): 16 | try: 17 | sdlf_stage_repositories = [] 18 | next_token = None 19 | while True: 20 | if next_token: 21 | response = codecommit.list_repositories(nextToken=next_token) 22 | else: 23 | response = codecommit.list_repositories() 24 | repos = response["repositories"] 25 | sdlf_stage_repositories.extend( 26 | [ 27 | repo["repositoryName"] 28 | for repo in repos 29 | if repo["repositoryName"].startswith(os.getenv("STAGES_REPOSITORIES_PREFIX")) 30 | ] 31 | ) 32 | next_token = response.get("nextToken") 33 | if not next_token: 34 | break 35 | 36 | logger.info("sdlf_stage_repositories: %s", sdlf_stage_repositories) 37 | 38 | except Exception as e: 39 | message = "Function exception: " + str(e) 40 | codepipeline.put_job_failure_result( 41 | jobId=event["CodePipeline.job"]["id"], 42 | failureDetails={"message": message, "type": "JobFailed"}, 43 | ) 44 | raise 45 | 46 | codepipeline.put_job_success_result( 47 | jobId=event["CodePipeline.job"]["id"], 48 | outputVariables={ 49 | "StagesRepositories": ",".join(sdlf_stage_repositories), 50 | "StagesRepositoriesCount": ",".join(list(map(str, range(0, len(sdlf_stage_repositories))))), 51 | }, 52 | ) 53 | return "Success" 54 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/base_interface.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import boto3 4 | from botocore.client import Config 5 | from botocore.exceptions import ClientError 6 | 7 | from ..commons import init_logger 8 | 9 | 10 | class BaseInterface: 11 | """Simplified base interface for AWS service interactions""" 12 | 13 | def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None): 14 | # Simple properties - no config wrapper needed 15 | self.team = team 16 | self.dataset = dataset 17 | self.pipeline = pipeline 18 | self.stage = stage 19 | self.log_level = log_level or os.getenv("LOG_LEVEL", "INFO") 20 | 21 | # Shared session and logger 22 | self.session = session or boto3.Session() 23 | self.logger = init_logger(__name__, self.log_level) 24 | 25 | # Common session config 26 | self.session_config = Config(user_agent="awssdlf/2.11.0") 27 | 28 | # SSM client for parameter reads (belongs in interface layer) 29 | self.ssm = self.session.client("ssm", config=self.session_config) 30 | 31 | # Initialize service-specific clients and config 32 | self._initialize_client() 33 | self._load_config() 34 | 35 | def _initialize_client(self): 36 | """Override in subclasses to initialize service-specific boto3 clients""" 37 | pass 38 | 39 | def _load_config(self): 40 | """Override in subclasses to load service-specific configuration from SSM""" 41 | pass 42 | 43 | def _get_ssm_parameter(self, parameter_name): 44 | """Get SSM parameter value - interface responsibility, not config""" 45 | try: 46 | response = self.ssm.get_parameter(Name=parameter_name) 47 | return response["Parameter"]["Value"] 48 | except ClientError as e: 49 | if e.response["Error"]["Code"] == "ThrottlingException": 50 | self.logger.error("SSM RATE LIMIT REACHED") 51 | else: 52 | self.logger.error(f"Error getting SSM parameter {parameter_name}: {e}") 53 | raise 54 | -------------------------------------------------------------------------------- 
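The interfaces above all follow the same two-hook pattern from `BaseInterface`: create the service client in `_initialize_client` and resolve SSM-backed configuration in `_load_config`, both on the shared boto3 session. A hedged sketch of what an additional interface could look like (editorial addition, not part of the repository; the `GlueInterface` class and its SSM parameter name are hypothetical):

```python
# Sketch only: a hypothetical interface following the BaseInterface pattern above.
from datalake_library.interfaces.base_interface import BaseInterface


class GlueInterface(BaseInterface):
    def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None):
        super().__init__(team, dataset, pipeline, stage, log_level, session)

    def _initialize_client(self):
        """Initialize Glue client on the shared boto3 session"""
        self.glue = self.session.client("glue", config=self.session_config)

    def _load_config(self):
        """Load Glue-specific configuration from SSM (parameter name is hypothetical)"""
        if self.team and self.dataset:
            self.glue_job_name = self._get_ssm_parameter(f"/SDLF/Glue/{self.team}/{self.dataset}/JobName")
```

`DataLakeClient` in `client.py` could then expose such an interface the same way it exposes `s3`, `dynamo`, `states`, `sqs` and `kms`, reusing the shared session.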
/sdlf-stage-ecsfargate/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.octagon import peh 6 | 7 | logger = init_logger(__name__) 8 | team = os.environ["TEAM"] 9 | dataset = os.environ["DATASET"] 10 | pipeline = os.environ["PIPELINE"] 11 | pipeline_stage = os.environ["PIPELINE_STAGE"] 12 | org = os.environ["ORG"] 13 | domain = os.environ["DOMAIN"] 14 | env = os.environ["ENV"] 15 | 16 | 17 | def lambda_handler(event, context): 18 | """Updates the S3 objects metadata catalog 19 | 20 | Arguments: 21 | event {dict} -- Dictionary with details on previous processing step 22 | context {dict} -- Dictionary with details on Lambda context 23 | 24 | Returns: 25 | {dict} -- Dictionary with outcome of the process 26 | """ 27 | try: 28 | logger.info("Initializing Octagon client") 29 | component = context.function_name.split("-")[-2].title() 30 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build() 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | for records in event: 36 | for record in records: 37 | if "processed" not in record or not record["processed"]: 38 | partial_failure = True 39 | 40 | if not partial_failure: 41 | octagon_client.update_pipeline_execution( 42 | status="{} {} Processing".format(pipeline_stage, component), component=component 43 | ) 44 | octagon_client.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | octagon_client.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stage-emrserverless/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library import octagon 4 | from datalake_library.commons import init_logger 5 | from datalake_library.octagon import peh 6 | 7 | logger = init_logger(__name__) 8 | team = os.environ["TEAM"] 9 | dataset = os.environ["DATASET"] 10 | pipeline = os.environ["PIPELINE"] 11 | pipeline_stage = os.environ["PIPELINE_STAGE"] 12 | org = os.environ["ORG"] 13 | domain = os.environ["DOMAIN"] 14 | env = os.environ["ENV"] 15 | 16 | 17 | def lambda_handler(event, context): 18 | """Updates the S3 objects metadata catalog 19 | 20 | Arguments: 21 | event {dict} -- Dictionary with details on previous processing step 22 | context {dict} -- Dictionary with details on Lambda context 23 | 24 | Returns: 25 | {dict} -- Dictionary with outcome of the process 26 | """ 27 | try: 28 | logger.info("Initializing Octagon client") 29 | component = context.function_name.split("-")[-2].title() 30 | octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build() 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | for records in event: 36 | for record in records: 37 | 
if "processed" not in record or not record["processed"]: 38 | partial_failure = True 39 | 40 | if not partial_failure: 41 | octagon_client.update_pipeline_execution( 42 | status="{} {} Processing".format(pipeline_stage, component), component=component 43 | ) 44 | octagon_client.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | octagon_client.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stageA/state-machine/stage-a.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Simple Lambda-based transform", 3 | "StartAt": "Try", 4 | "States": { 5 | "Try": { 6 | "Type": "Parallel", 7 | "Branches": [ 8 | { 9 | "StartAt": "Execute Light Transformation", 10 | "States": { 11 | "Execute Light Transformation": { 12 | "Type": "Task", 13 | "Resource": "arn:aws:states:::lambda:invoke", 14 | "OutputPath": "$.Payload", 15 | "Parameters": { 16 | "Payload.$": "$", 17 | "FunctionName": "${lStep2}:$LATEST" 18 | }, 19 | "Retry": [ 20 | { 21 | "ErrorEquals": [ 22 | "Lambda.ServiceException", 23 | "Lambda.AWSLambdaException", 24 | "Lambda.SdkClientException", 25 | "Lambda.TooManyRequestsException" 26 | ], 27 | "IntervalSeconds": 2, 28 | "MaxAttempts": 6, 29 | "BackoffRate": 2 30 | } 31 | ], 32 | "End": true 33 | } 34 | } 35 | } 36 | ], 37 | "End": true, 38 | "Catch": [ 39 | { 40 | "ErrorEquals": [ 41 | "States.ALL" 42 | ], 43 | "ResultPath": null, 44 | "Next": "Error" 45 | } 46 | ] 47 | }, 48 | "Error": { 49 | "Type": "Task", 50 | "Resource": "arn:aws:states:::lambda:invoke", 51 | "OutputPath": "$.Payload", 52 | "Parameters": { 53 | "Payload.$": "$", 54 | "FunctionName": "${lError}:$LATEST" 55 | }, 56 | "Retry": [ 57 | { 58 | "ErrorEquals": [ 59 | "Lambda.ServiceException", 60 | "Lambda.AWSLambdaException", 61 | "Lambda.SdkClientException", 62 | "Lambda.TooManyRequestsException" 63 | ], 64 | "IntervalSeconds": 2, 65 | "MaxAttempts": 6, 66 | "BackoffRate": 2 67 | } 68 | ], 69 | "Next": "Fail" 70 | }, 71 | "Fail": { 72 | "Type": "Fail" 73 | } 74 | } 75 | } -------------------------------------------------------------------------------- /docs/constructs/stage-glue.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-glue (sdlf-stageB) 2 | 3 | !!! note 4 | `sdlf-stage-glue` is defined in the [sdlf-stageB](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageB) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Glue](../_static/sdlf-stage-glue.png) 9 | 10 | Run a Glue job. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 
17 | 18 | ``` 19 | rMainB: 20 | Type: awslabs::sdlf::stageB::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pDatasetBucket: "{{resolve:ssm:/SDLF2/S3/StageBucket}}" 24 | pStageName: B 25 | pPipeline: main 26 | pTeamName: iot 27 | pTriggerType: schedule 28 | pEventPattern: !Sub >- 29 | { 30 | "source": ["aws.states"], 31 | "detail-type": ["Step Functions Execution Status Change"], 32 | "detail": { 33 | "status": ["SUCCEEDED"], 34 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 35 | } 36 | } 37 | pSchedule: "cron(*/5 * * * ? *)" 38 | pEnableTracing: false 39 | ``` 40 | 41 | ## Interface 42 | 43 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-glue` publishes the following parameters: 44 | 45 | | SSM Parameter | Description | Comment | 46 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 47 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | ARN of the Step Functions state machine for the stage | | 48 | -------------------------------------------------------------------------------- /docs/constructs/stage-dataquality.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-dataquality 2 | 3 | !!! note 4 | `sdlf-stage-dataquality` is defined in the [sdlf-stage-dataquality](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stage-dataquality) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Data Quality](../_static/sdlf-stage-dataquality.png) 9 | 10 | Create a Glue Data Quality ruleset from recommendations then apply this ruleset to a given Glue table. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainDq: 20 | Type: proserve::iot::dataquality::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: DQ 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: !Sub >- 28 | { 29 | "source": ["aws.states"], 30 | "detail-type": ["Step Functions Execution Status Change"], 31 | "detail": { 32 | "status": ["SUCCEEDED"], 33 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-b"] 34 | } 35 | } 36 | pEnableTracing: false 37 | ``` 38 | 39 | ## Interface 40 | 41 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html).
`sdlf-stage-dataquality` publishes the following parameters: 42 | 43 | | SSM Parameter | Description | Comment | 44 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 45 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Function | | 46 | -------------------------------------------------------------------------------- /sdlf-cicd/template-lambda-layer.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Transform: AWS::LanguageExtensions 3 | Description: Deploy Lambda Layers 4 | 5 | Parameters: 6 | pArtifactsBucket: 7 | Description: The artifacts bucket used by CodeBuild and CodePipeline 8 | Type: String 9 | pDomain: 10 | Description: Name of the data domain (all lowercase, no symbols or spaces) 11 | Type: String 12 | pEnvironment: 13 | Description: Environment name 14 | Type: String 15 | AllowedValues: [dev, test, prod] 16 | pTeamName: 17 | Description: Name of the team (all lowercase, no symbols or spaces) 18 | Type: String 19 | pLayers: 20 | Description: List of folder names from layers/ directory 21 | Type: CommaDelimitedList 22 | AllowedPattern: "^[a-zA-Z0-9]*$" 23 | pGitRef: 24 | Description: Git reference (commit id) with the sources of these layers 25 | Type: String 26 | 27 | Conditions: 28 | DatalakeLibraryLayer: !Equals [!Ref pTeamName, sdlf] 29 | 30 | Resources: 31 | ######## LAMBDA LAYERS ######## 32 | "Fn::ForEach::LambdaLayerResources": 33 | - LayerName 34 | - !Ref pLayers 35 | - "r${LayerName}LambdaLayer": 36 | Type: AWS::Lambda::LayerVersion 37 | Properties: 38 | CompatibleRuntimes: 39 | - python3.12 40 | Content: 41 | S3Bucket: !Ref pArtifactsBucket 42 | S3Key: !Sub layers/${pDomain}/${pEnvironment}/${pTeamName}/${LayerName}-${pGitRef}.zip 43 | Description: !Sub ${pTeamName} ${LayerName} Lambda Layer 44 | LayerName: 45 | !If [ 46 | DatalakeLibraryLayer, 47 | !Sub "sdlf-${LayerName}", 48 | !Sub "sdlf-${pTeamName}-${LayerName}" 49 | ] 50 | "r${LayerName}LambdaLayerSsm": 51 | Type: AWS::SSM::Parameter 52 | Properties: 53 | Name: 54 | !If [ 55 | DatalakeLibraryLayer, 56 | !Sub "/SDLF/Lambda/Latest${LayerName}Layer", 57 | !Sub "/SDLF/Lambda/${pTeamName}/Latest${LayerName}Layer" 58 | ] 59 | Type: String 60 | Value: !Ref 61 | "Fn::Sub": r${LayerName}LambdaLayer 62 | Description: !Sub The ARN of the latest version of the ${pTeamName} ${LayerName} layer 63 | -------------------------------------------------------------------------------- /sdlf-stage-glue/README.md: -------------------------------------------------------------------------------- 1 | # [**DEPRECATED**] 2 | # sdlf-stage-glue (sdlf-stageB) 3 | 4 | !!! note 5 | `sdlf-stage-glue` is defined in the [sdlf-stageB](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageB) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 6 | 7 | ## Infrastructure 8 | 9 | ![SDLF Stage Glue](../_static/sdlf-stage-glue.png) 10 | 11 | Run a Glue job. 12 | 13 | ## Usage 14 | 15 | ### CloudFormation with [sdlf-cicd](cicd.md) 16 | 17 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 
18 | 19 | ``` 20 | rMainB: 21 | Type: awslabs::sdlf::stageB::MODULE 22 | Properties: 23 | pPipelineReference: !Ref pPipelineReference 24 | pDatasetBucket: "{{resolve:ssm:/SDLF2/S3/StageBucket}}" 25 | pStageName: B 26 | pPipeline: main 27 | pTeamName: iot 28 | pTriggerType: schedule 29 | pEventPattern: !Sub >- 30 | { 31 | "source": ["aws.states"], 32 | "detail-type": ["Step Functions Execution Status Change"], 33 | "detail": { 34 | "status": ["SUCCEEDED"], 35 | "stateMachineArn": ["arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:sdlf-iot-main-sm-A"] 36 | } 37 | } 38 | pSchedule: "cron(*/5 * * * ? *)" 39 | pEnableTracing: false 40 | ``` 41 | 42 | ## Interface 43 | 44 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-glue` publishes the following parameters: 45 | 46 | | SSM Parameter | Description | Comment | 47 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 48 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Name of the DynamoDB used to store mappings to transformation | | 49 | -------------------------------------------------------------------------------- /sdlf-stage-glue/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.sdlf import PipelineExecutionHistoryAPI 5 | 6 | logger = init_logger(__name__) 7 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 8 | peh_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 9 | manifests_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | """Updates the S3 objects metadata catalog 14 | 15 | Arguments: 16 | event {dict} -- Dictionary with details on previous processing step 17 | context {dict} -- Dictionary with details on Lambda context 18 | 19 | Returns: 20 | {dict} -- Dictionary with outcome of the process 21 | """ 22 | try: 23 | logger.info("Initializing Octagon client") 24 | component = context.function_name.split("-")[-2].title() 25 | pipeline_execution = PipelineExecutionHistoryAPI( 26 | run_in_context="LAMBDA", 27 | region=os.getenv("AWS_REGION"), 28 | peh_table_instance=peh_table_instance, 29 | manifests_table_instance=manifests_table_instance, 30 | ) 31 | peh_id = event[0]["Items"][0]["transform"]["peh_id"] 32 | pipeline_execution.retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | # for records in event: 36 | # for record in records: 37 | # if "processed" not in record or not record["processed"]: 38 | # partial_failure = True 39 | 40 | if not partial_failure: 41 | pipeline_execution.update_pipeline_execution( 42 | status=f"{deployment_instance} {component} Processing", component=component 43 | ) 44 | pipeline_execution.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | pipeline_execution.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{deployment_instance} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- 
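The handler above expects its input to carry the pipeline-execution id under `event[0]["Items"][0]["transform"]["peh_id"]`, and it derives the reporting component from the Lambda function name. A rough local-invocation sketch — the event payload, function name and instance values are illustrative, and `datalake_library` (see `sdlf-datalakeLibrary`) plus AWS credentials are assumed to be available:

```
import os
from types import SimpleNamespace

# Read at import time by the module above (values illustrative)
os.environ["DEPLOYMENT_INSTANCE"] = "mainB"
os.environ["DATASET_DEPLOYMENT_INSTANCE"] = "dev"

from lambda_function import lambda_handler  # noqa: E402  # the handler shown above

# Event shape inferred from the handler: items carrying the peh_id of the current run
sample_event = [{"Items": [{"transform": {"peh_id": "00000000-0000-0000-0000-000000000000"}}]}]
# Only function_name is read from the context object
sample_context = SimpleNamespace(function_name="sdlf-mainB-postupdate-metadata")

# Note: this calls the real pipeline-execution DynamoDB tables, so point it at a test instance
lambda_handler(sample_event, sample_context)
```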
/sdlf-utils/workshop-examples/legislators/scripts/legislators-glue-job.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from awsglue.context import GlueContext 4 | from awsglue.job import Job 5 | from awsglue.transforms import Join 6 | from awsglue.utils import getResolvedOptions 7 | from pyspark.context import SparkContext 8 | 9 | args = getResolvedOptions(sys.argv, ["JOB_NAME", "SOURCE_LOCATION", "OUTPUT_LOCATION"]) 10 | source = args["SOURCE_LOCATION"] 11 | destination = args["OUTPUT_LOCATION"] 12 | 13 | glueContext = GlueContext(SparkContext.getOrCreate()) 14 | job = Job(glueContext) 15 | job.init(args["JOB_NAME"], args) 16 | 17 | persons = glueContext.create_dynamic_frame.from_options( 18 | connection_type="s3", 19 | format="json", 20 | connection_options={"paths": ["{}/{}".format(source, "persons_parsed.json")]}, 21 | format_options={"withHeader": False}, 22 | transformation_ctx="path={}".format("persons_df"), 23 | ) 24 | 25 | memberships = glueContext.create_dynamic_frame.from_options( 26 | connection_type="s3", 27 | format="json", 28 | connection_options={"paths": ["{}/{}".format(source, "memberships_parsed.json")]}, 29 | format_options={"withHeader": False}, 30 | transformation_ctx="path={}".format("memberships_df"), 31 | ) 32 | 33 | organizations = ( 34 | glueContext.create_dynamic_frame.from_options( 35 | connection_type="s3", 36 | format="json", 37 | connection_options={"paths": ["{}/{}".format(source, "organizations_parsed.json")]}, 38 | format_options={"withHeader": False}, 39 | transformation_ctx="path={}".format("organizations_df"), 40 | ) 41 | .rename_field("id", "org_id") 42 | .rename_field("name", "org_name") 43 | ) 44 | 45 | history = Join.apply( 46 | organizations, Join.apply(persons, memberships, "id", "person_id"), "org_id", "organization_id" 47 | ).drop_fields(["person_id", "org_id"]) 48 | 49 | persons.toDF().write.mode("overwrite").parquet("{}/persons/".format(destination)) 50 | organizations.toDF().write.mode("overwrite").parquet("{}/organizations/".format(destination)) 51 | memberships.toDF().write.mode("overwrite").parquet("{}//memberships/".format(destination)) 52 | history.toDF().write.mode("overwrite").parquet("{}/history/".format(destination), partitionBy=["org_name"]) 53 | 54 | job.commit() 55 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/postupdate-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datalake_library.commons import init_logger 4 | from datalake_library.sdlf import PipelineExecutionHistoryAPI 5 | 6 | logger = init_logger(__name__) 7 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 8 | peh_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 9 | manifests_table_instance = os.environ["DATASET_DEPLOYMENT_INSTANCE"] 10 | 11 | 12 | def lambda_handler(event, context): 13 | """Updates the S3 objects metadata catalog 14 | 15 | Arguments: 16 | event {dict} -- Dictionary with details on previous processing step 17 | context {dict} -- Dictionary with details on Lambda context 18 | 19 | Returns: 20 | {dict} -- Dictionary with outcome of the process 21 | """ 22 | try: 23 | logger.info("Initializing Octagon client") 24 | component = context.function_name.split("-")[-2].title() 25 | pipeline_execution = PipelineExecutionHistoryAPI( 26 | run_in_context="LAMBDA", 27 | region=os.getenv("AWS_REGION"), 28 | 
peh_table_instance=peh_table_instance, 29 | manifests_table_instance=manifests_table_instance, 30 | ) 31 | peh_id = event[0]["run_output"][0]["transform"]["peh_id"] 32 | pipeline_execution.retrieve_pipeline_execution(peh_id) 33 | 34 | partial_failure = False 35 | # for records in event: 36 | # for record in records: 37 | # if "processed" not in record or not record["processed"]: 38 | # partial_failure = True 39 | 40 | if not partial_failure: 41 | pipeline_execution.update_pipeline_execution( 42 | status=f"{deployment_instance} {component} Processing", component=component 43 | ) 44 | pipeline_execution.end_pipeline_execution_success() 45 | else: 46 | raise Exception("Failure: Processing failed for one or more record") 47 | 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | pipeline_execution.end_pipeline_execution_failed( 51 | component=component, issue_comment=f"{deployment_instance} {component} Error: {repr(e)}" 52 | ) 53 | raise e 54 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-fetch-metadata/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | from datalake_library.commons import init_logger 2 | 3 | logger = init_logger(__name__) 4 | 5 | 6 | def get_glue_transform_details(bucket, team, dataset, pipeline, stage): 7 | # Default Glue job configuration 8 | job_name = f"sdlf-{team}-{dataset}-glue-job" # Name of the Glue Job 9 | glue_capacity = {"WorkerType": "G.1X", "NumberOfWorkers": 10} 10 | wait_time = 60 11 | glue_arguments = { 12 | # Specify any arguments needed based on bucket and keys (e.g. input/output S3 locations) 13 | "--SOURCE_LOCATION": f"s3://{bucket}/pre-stage/{team}/{dataset}", 14 | "--OUTPUT_LOCATION": f"s3://{bucket}/post-stage/{team}/{dataset}", 15 | "--job-bookmark-option": "job-bookmark-enable", 16 | } 17 | 18 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 19 | logger.info(f"Using default Glue job configuration: {job_name}") 20 | 21 | return {"job_name": job_name, "wait_time": wait_time, "arguments": glue_arguments, **glue_capacity} 22 | 23 | 24 | def lambda_handler(event, context): 25 | """Calls custom transform developed by user 26 | 27 | Arguments: 28 | event {dict} -- Dictionary with details on previous processing step 29 | context {dict} -- Dictionary with details on Lambda context 30 | 31 | Returns: 32 | {dict} -- Dictionary with Processed Bucket and Key(s) 33 | """ 34 | try: 35 | logger.info("Fetching event data from previous step") 36 | bucket = event["body"]["bucket"] 37 | team = event["body"]["team"] 38 | pipeline = event["body"]["pipeline"] 39 | stage = event["body"]["pipeline_stage"] 40 | dataset = event["body"]["dataset"] 41 | 42 | # Call custom transform created by user and process the file 43 | logger.info("Calling user custom processing code") 44 | event["body"]["glue"] = get_glue_transform_details(bucket, team, dataset, pipeline, stage) 45 | event["body"]["glue"]["crawler_name"] = "-".join(["sdlf", team, dataset, "post-stage-crawler"]) 46 | 47 | logger.info("Successfully prepared Glue job configuration") 48 | except Exception as e: 49 | logger.error("Fatal error", exc_info=True) 50 | raise e 51 | return event 52 | -------------------------------------------------------------------------------- /docs/constructs/stage-lambda.md: -------------------------------------------------------------------------------- 1 | # sdlf-stage-lambda (sdlf-stageA) 2 | 3 | !!! 
note 4 | `sdlf-stage-lambda` is defined in the [sdlf-stageA](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageA) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Stage Lambda](../_static/sdlf-stage-lambda.png) 9 | 10 | Run a Lambda function. 11 | 12 | ## Usage 13 | 14 | ### CloudFormation with [sdlf-cicd](cicd.md) 15 | 16 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 17 | 18 | ``` 19 | rMainA: 20 | Type: awslabs::sdlf::stageA::MODULE 21 | Properties: 22 | pPipelineReference: !Ref pPipelineReference 23 | pStageName: A 24 | pPipeline: main 25 | pTeamName: iot 26 | pTriggerType: event 27 | pEventPattern: >- 28 | { 29 | "source": ["aws.s3"], 30 | "detail-type": ["Object Created"], 31 | "detail": { 32 | "bucket": { 33 | "name": ["{{resolve:ssm:/SDLF2/S3/RawBucket}}"] 34 | }, 35 | "object": { 36 | "key": [{ "prefix": "iot/legislators/" }] 37 | } 38 | } 39 | } 40 | pEnableTracing: false 41 | ``` 42 | 43 | ## Interface 44 | 45 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-stage-lambda` publishes the following parameters: 46 | 47 | | SSM Parameter | Description | Comment | 48 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 49 | | `/SDLF/Lambda/{team}/{pipeline}{stage}RoutingLambda` | Routing Lambda | | 50 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions | | 51 | -------------------------------------------------------------------------------- /sdlf-stage-lambda/README.md: -------------------------------------------------------------------------------- 1 | # [**DEPRECATED**] 2 | # sdlf-stage-lambda (sdlf-stageA) 3 | 4 | !!! note 5 | `sdlf-stage-lambda` is defined in the [sdlf-stageA](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-stageA) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 6 | 7 | ## Infrastructure 8 | 9 | ![SDLF Stage Lambda](../_static/sdlf-stage-lambda.png) 10 | 11 | Run a Lambda function. 12 | 13 | ## Usage 14 | 15 | ### CloudFormation with [sdlf-cicd](cicd.md) 16 | 17 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 18 | 19 | ``` 20 | rMainA: 21 | Type: awslabs::sdlf::stageA::MODULE 22 | Properties: 23 | pPipelineReference: !Ref pPipelineReference 24 | pStageName: A 25 | pPipeline: main 26 | pTeamName: iot 27 | pTriggerType: event 28 | pEventPattern: >- 29 | { 30 | "source": ["aws.s3"], 31 | "detail-type": ["Object Created"], 32 | "detail": { 33 | "bucket": { 34 | "name": ["{{resolve:ssm:/SDLF2/S3/RawBucket}}"] 35 | }, 36 | "object": { 37 | "key": [{ "prefix": "iot/legislators/" }] 38 | } 39 | } 40 | } 41 | pEnableTracing: false 42 | ``` 43 | 44 | ## Interface 45 | 46 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). 
`sdlf-stage-lambda` publishes the following parameters: 47 | 48 | | SSM Parameter | Description | Comment | 49 | | ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------- | 50 | | `/SDLF/Lambda/{team}/{pipeline}{stage}RoutingLambda` | Routing Lambda | | 51 | | `/SDLF/SM/{team}/{pipeline}{stage}SM` | Step Functions | | 52 | -------------------------------------------------------------------------------- /sdlf-cicd/template-cicd-team-repository.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: CICD resources to handle deployment of a new team (repository) 3 | 4 | Parameters: 5 | pKMSKey: 6 | Description: The KMS key used by CodeBuild and CodePipeline 7 | Type: AWS::SSM::Parameter::Value 8 | Default: /SDLF/KMS/CICDKeyId 9 | pDomain: 10 | Description: Name of the data domain (all lowercase, no symbols or spaces) 11 | Type: String 12 | AllowedPattern: "[a-z0-9]{2,9}" 13 | pTeamName: 14 | Description: Team name 15 | Type: String 16 | pGitPlatform: 17 | Description: Platform used to host git repositories 18 | Type: AWS::SSM::Parameter::Value 19 | Default: /SDLF/Misc/GitPlatform 20 | pMainRepositoriesPrefix: 21 | Type: String 22 | Default: sdlf-main- 23 | 24 | Conditions: 25 | GitPlatformCodeCommit: !Equals [!Ref pGitPlatform, "CodeCommit"] 26 | GitPlatformGitHub: !Equals [!Ref pGitPlatform, "GitHub"] 27 | 28 | Resources: 29 | rTeamMainCodeCommit: 30 | Type: AWS::CodeCommit::Repository 31 | Condition: GitPlatformCodeCommit 32 | Metadata: 33 | cfn-lint: 34 | config: 35 | ignore_checks: 36 | - E3002 37 | Properties: 38 | Code: 39 | BranchName: main 40 | S3: ./README.md 41 | RepositoryDescription: !Sub ${pDomain} ${pTeamName} main repository 42 | RepositoryName: !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 43 | KmsKeyId: !Ref pKMSKey 44 | 45 | rTeamMainGitHub: 46 | Type: GitHub::Repositories::Repository 47 | Metadata: 48 | cfn-lint: 49 | config: 50 | ignore_checks: 51 | - E3001 52 | Condition: GitPlatformGitHub 53 | Properties: 54 | Org: !Sub "{{resolve:ssm:/SDLF/${pGitPlatform}/Group}}" 55 | Name: !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 56 | Private: true 57 | Visibility: private 58 | Archived: false 59 | 60 | rTeamMainCodeCommitSsm: 61 | Type: AWS::SSM::Parameter 62 | Properties: 63 | Name: !Sub /SDLF/${pGitPlatform}/${pTeamName}/Main${pGitPlatform} 64 | Type: String 65 | Value: !If 66 | - GitPlatformCodeCommit 67 | - !GetAtt rTeamMainCodeCommit.Name 68 | - !Sub ${pMainRepositoriesPrefix}${pDomain}-${pTeamName} 69 | Description: !Sub Name of the ${pDomain} ${pTeamName} main repository 70 | -------------------------------------------------------------------------------- /sdlf-datalakeLibrary/python/datalake_library/interfaces/s3_interface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from urllib.parse import unquote_plus 4 | 5 | from .base_interface import BaseInterface 6 | 7 | 8 | class S3Interface(BaseInterface): 9 | def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None): 10 | super().__init__(team, dataset, pipeline, stage, log_level, session) 11 | 12 | def _initialize_client(self): 13 | # S3 specific client 14 | self.s3 = self.session.client("s3", config=self.session_config) 15 | 16 | def _load_config(self): 17 | """Load S3-specific configuration 
from SSM""" 18 | self.raw_bucket = self._get_ssm_parameter("/SDLF2/S3/RawBucket") 19 | self.stage_bucket = self._get_ssm_parameter("/SDLF2/S3/StageBucket") 20 | self.analytics_bucket = self._get_ssm_parameter("/SDLF2/S3/AnalyticsBucket").split(":")[-1] 21 | self.artifacts_bucket = self._get_ssm_parameter("/SDLF2/S3/ArtifactsBucket") 22 | 23 | def download_object(self, bucket, key): 24 | dir_path = f"/tmp/{bucket}/" 25 | if os.path.exists(dir_path): 26 | shutil.rmtree(dir_path, ignore_errors=True) 27 | os.makedirs(dir_path) 28 | 29 | object_path = dir_path + key.split("/")[-1] 30 | key = unquote_plus(key) 31 | self.s3.download_file(bucket, key, object_path) 32 | return object_path 33 | 34 | def upload_object(self, object_path, bucket, key, kms_key=None): 35 | extra_kwargs = {} 36 | if kms_key: 37 | extra_kwargs = {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": kms_key} 38 | self.s3.upload_file(object_path, bucket, key, ExtraArgs=extra_kwargs) 39 | 40 | def copy_object(self, source_bucket, source_key, dest_bucket, dest_key=None, kms_key=None): 41 | source_key = unquote_plus(source_key) 42 | dest_key = dest_key or source_key 43 | copy_source = {"Bucket": source_bucket, "Key": source_key} 44 | extra_kwargs = {} 45 | if kms_key: 46 | extra_kwargs = {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": kms_key} 47 | self.s3.copy_object(CopySource=copy_source, Bucket=dest_bucket, Key=dest_key, **extra_kwargs) 48 | 49 | def get_size_and_last_modified(self, bucket, key): 50 | object_metadata = self.s3.head_object(Bucket=bucket, Key=key) 51 | return (object_metadata["ContentLength"], object_metadata["LastModified"].isoformat()) 52 | -------------------------------------------------------------------------------- /.github/workflows/static-checking.yml: -------------------------------------------------------------------------------- 1 | name: Static Checking 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | cfn: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 3.12 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: 3.12 20 | - name: Set up Ruby 3.2 21 | uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: 3.2 24 | - name: install requirements 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install "cfn-lint<1" 28 | gem install cfn-nag 29 | - name: cfn-lint 30 | run: | 31 | shopt -s globstar 32 | cfn-lint ./**/*.yaml 33 | - name: cfn-nag 34 | run: | 35 | cat <> .cfn-nag-deny-list.yml 36 | - id: W61 37 | reason: |- 38 | Certificates are handled by customers downstream, see https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-encryption-enable.html#emr-encryption-certificates 39 | This is ignored only during CI as we want customers to be aware they need to update the security configuration should they choose to use it. 40 | EOT 41 | find . 
-not \( -type f -name 'template-glue-job.yaml' -o -type f -name 'template-lambda-layer.yaml' \) -type f -name '*.yaml' -print0 \ 42 | | xargs -0 -L 1 cfn_nag_scan --fail-on-warnings --ignore-fatal --deny-list-path .cfn-nag-deny-list.yml --input-path 43 | python: 44 | runs-on: ubuntu-latest 45 | steps: 46 | - uses: actions/checkout@v4 47 | - name: Set up Python 3.12 48 | uses: actions/setup-python@v5 49 | with: 50 | python-version: 3.12 51 | - name: install requirements 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install ruff 55 | - name: ruff format 56 | run: ruff format --check . 57 | - name: ruff 58 | run: ruff check --output-format github . 59 | shellcheck: 60 | runs-on: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v4 63 | - name: install requirements 64 | run: | 65 | sudo apt update 66 | sudo apt install shellcheck 67 | - name: shellcheck 68 | run: | 69 | find . -type f \( -name '*.sh' -o -name '*.bash' -o -name '*.ksh' \) -print0 \ 70 | | xargs -0 shellcheck -x --format gcc 71 | -------------------------------------------------------------------------------- /sdlf-monitoring/kibana/generic_dashboard.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_id": "e4069440-fb2c-11e9-86fb-19c7ff919e3c", 4 | "_type": "dashboard", 5 | "_source": { 6 | "title": "Generic Dashboard", 7 | "hits": 0, 8 | "description": "", 9 | "panelsJSON": "[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":24,\"h\":15,\"i\":\"1\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"dec27710-fb2c-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":24,\"y\":0,\"w\":24,\"h\":15,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"Count\":\"#E24D42\"},\"legendOpen\":false}},\"id\":\"26819f40-fb2d-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":42,\"w\":48,\"h\":14,\"i\":\"5\"},\"embeddableConfig\":{\"vis\":{\"params\":{\"sort\":{\"columnIndex\":2,\"direction\":\"asc\"}}}},\"id\":\"60903910-fb2f-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":0,\"y\":15,\"w\":13,\"h\":12,\"i\":\"8\"},\"embeddableConfig\":{},\"id\":\"3c879b80-fbc5-11e9-86fb-19c7ff919e3c\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"9\",\"gridData\":{\"x\":37,\"y\":15,\"w\":11,\"h\":12,\"i\":\"9\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"/aws/lambda/f1-dl04-dev-activemq-process-a\":\"#EA6460\"},\"legendOpen\":false}},\"id\":\"945eca40-fbc5-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"12\",\"gridData\":{\"x\":13,\"y\":15,\"w\":24,\"h\":12,\"i\":\"12\"},\"embeddableConfig\":{\"vis\":{\"colors\":{\"Count\":\"#5195CE\"},\"legendOpen\":false}},\"id\":\"131befe0-fbd3-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"13\",\"gridData\":{\"x\":0,\"y\":27,\"w\":24,\"h\":15,\"i\":\"13\"},\"embeddableConfig\":{},\"id\":\"20f6a2e0-fbce-11e9-b09b-8b893ba5891b\",\"type\":\"visualization\",\"version\":\"6.3.1\"},{\"panelIndex\":\"15\",\"gridData\":{\"x\":24,\"y\":27,\"w\":24,\"h\":15,\"i\":\"15\"},\"version\":\"6.3.1\",\"type\":\"visualization\",\"id\":\"6506b6d0-1048-11ea-a53e-f38a7f594614\",\"embeddableConfig\":{}}]", 10 | "optionsJSON": "{\"darkTheme\":false,\"hidePanelTitles\":false,\"useMargins\":true}", 11 | "version": 1, 12 | 
"timeRestore": false, 13 | "kibanaSavedObjectMeta": { 14 | "searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}" 15 | } 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/10-demo/sdlf-workshop/pipeline-main.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: Main pipeline 3 | 4 | Parameters: 5 | pPipelineReference: 6 | Type: String 7 | Default: none 8 | 9 | Resources: 10 | rMainA: 11 | Type: AWS::CloudFormation::Stack 12 | Properties: 13 | TemplateURL: "{{resolve:ssm:/sdlf/stagelambda/main}}" 14 | Parameters: 15 | pPipelineReference: !Ref pPipelineReference 16 | pDeploymentInstance: mainA 17 | pStorageDeploymentInstance: dev 18 | pDatasetDeploymentInstance: dev 19 | pTriggerType: event 20 | pEventPattern: >- 21 | { 22 | "source": ["aws.s3"], 23 | "detail-type": ["Object Created"], 24 | "detail": { 25 | "bucket": { 26 | "name": ["{{resolve:ssm:/sdlf/storage/rRawBucket/dev}}"] 27 | }, 28 | "object": { 29 | "key": [{ "prefix": "legislators/" }] 30 | } 31 | } 32 | } 33 | pEnableTracing: false 34 | 35 | rMainB: 36 | Type: AWS::CloudFormation::Stack 37 | Properties: 38 | TemplateURL: "{{resolve:ssm:/sdlf/stageglue/main}}" 39 | Parameters: 40 | pPipelineReference: !Ref pPipelineReference 41 | pDeploymentInstance: mainB 42 | pStorageDeploymentInstance: dev 43 | pDatasetDeploymentInstance: dev 44 | pGlueJobName: sdlf-mainB-glue-job 45 | pGlueNumberOfWorkers: 10 46 | pGlueWorkerType: G.1X 47 | pTriggerType: schedule 48 | pEventPattern: >- 49 | { 50 | "source": ["aws.s3"], 51 | "detail-type": ["Object Created"], 52 | "detail": { 53 | "bucket": { 54 | "name": ["{{resolve:ssm:/sdlf/storage/rStageBucket/dev}}"] 55 | }, 56 | "object": { 57 | "key": [{ "prefix": "legislators/mainA/" }] 58 | } 59 | } 60 | } 61 | pSchedule: "cron(*/5 * * * ? 
*)" 62 | pEnableTracing: false 63 | pGlueArguments: >- 64 | { 65 | "--job-bookmark-option": "job-bookmark-enable", 66 | "--enable-metrics": "", 67 | "--enable-auto-scaling": "true", 68 | "--SOURCE_LOCATION": !Sub "s3://{{resolve:ssm:/sdlf/storage/rStageBucket/dev}}/legislators/mainA", 69 | "--OUTPUT_LOCATION": !Sub "s3://{{resolve:ssm:/sdlf/storage/rAnalyticsBucket/dev}}/legislators/mainB" 70 | } 71 | -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/stage-routing/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library.commons import init_logger 5 | from datalake_library.configuration.resource_configs import ( 6 | S3Configuration, 7 | SQSConfiguration, 8 | StateMachineConfiguration, 9 | ) 10 | from datalake_library.interfaces.sqs_interface import SQSInterface 11 | from datalake_library.interfaces.states_interface import StatesInterface 12 | 13 | logger = init_logger(__name__) 14 | 15 | 16 | def lambda_handler(event, context): 17 | """Checks if any items need processing and triggers state machine 18 | Arguments: 19 | event {dict} -- Dictionary with details on what needs processing 20 | context {dict} -- Dictionary with details on Lambda context 21 | """ 22 | 23 | try: 24 | records = event["Records"] 25 | logger.info(f"Received {len(records)} messages") 26 | response = {} 27 | for record in records: 28 | event_body = json.loads(json.loads(record["body"])["output"])[0]["body"] 29 | logger.info(event_body) 30 | team = event_body["team"] 31 | pipeline = event_body["pipeline"] 32 | stage = os.environ["PIPELINE_STAGE"] 33 | dataset = event_body["dataset"] 34 | org = event_body["org"] 35 | domain = event_body["domain"] 36 | env = event_body["env"] 37 | stage_bucket = S3Configuration().stage_bucket 38 | 39 | response = { 40 | "statusCode": 200, 41 | "body": { 42 | "bucket": stage_bucket, 43 | "team": team, 44 | "pipeline": pipeline, 45 | "pipeline_stage": stage, 46 | "dataset": dataset, 47 | "org": org, 48 | "domain": domain, 49 | "env": env, 50 | }, 51 | } 52 | if response: 53 | logger.info("Starting State Machine Execution") 54 | state_config = StateMachineConfiguration(team, pipeline, stage) 55 | StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, response) 56 | except Exception as e: 57 | # If failure send to DLQ 58 | sqs_config = SQSConfiguration(team, pipeline, stage) 59 | dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name) 60 | dlq_interface.send_message_to_fifo_queue(json.dumps(response), "failed") 61 | logger.error("Fatal error", exc_info=True) 62 | raise e 63 | -------------------------------------------------------------------------------- /docs/constructs/dataset.md: -------------------------------------------------------------------------------- 1 | # sdlf-dataset 2 | 3 | !!! note 4 | `sdlf-dataset` is defined in the [sdlf-dataset](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-dataset) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Dataset](../_static/sdlf-dataset.png) 9 | 10 | A SDLF dataset is a logical construct referring to a grouping of data. It can be anything from a single table to an entire database with multiple tables for example. 
However, an overall good practice is to limit the infrastructure deployed to the minimum to avoid unnecessary overhead and cost. It means that in general, the more data is grouped together the better. Abstraction at the transformation code level can then help make distinctions within a given dataset. 11 | 12 | Examples of datasets are: 13 | 14 | - A relational database with multiple tables (e.g. Sales DB with orders and customers tables) 15 | - A group of files from a data source (e.g. XML files from a Telemetry system) 16 | - A streaming data source (e.g. Kinesis data stream batching files and dumping them into S3) 17 | 18 | `sdlf-dataset` creates a Glue database, as well as a Glue crawler. 19 | 20 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 21 | 22 | ## Usage 23 | 24 | ### CloudFormation with [sdlf-cicd](cicd.md) 25 | 26 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 27 | 28 | ``` 29 | rExample: 30 | Type: awslabs::sdlf::dataset::MODULE 31 | Properties: 32 | pPipelineReference: !Ref pPipelineReference 33 | pTeamName: iot 34 | pDatasetName: legislators 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-dataset` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ----------------------------------------- | -------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Datasets/{team}/{dataset}` | Dataset-specific metadata for data pipelines | | 44 | | `/SDLF/Glue/{team}/{dataset}/GlueCrawler` | Team dataset Glue crawler | | 45 | | `/SDLF/Glue/{team}/{dataset}/DataCatalog` | Team dataset metadata catalog" | | 46 | -------------------------------------------------------------------------------- /sdlf-dataset/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-dataset 2 | 3 | !!! note 4 | `sdlf-dataset` is defined in the [sdlf-dataset](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-dataset) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Dataset](../_static/sdlf-dataset.png) 9 | 10 | A SDLF dataset is a logical construct referring to a grouping of data. It can be anything from a single table to an entire database with multiple tables for example. However, an overall good practice is to limit the infrastructure deployed to the minimum to avoid unnecessary overhead and cost. It means that in general, the more data is grouped together the better. Abstraction at the transformation code level can then help make distinctions within a given dataset. 11 | 12 | Examples of datasets are: 13 | 14 | - A relational database with multiple tables (e.g. Sales DB with orders and customers tables) 15 | - A group of files from a data source (e.g. XML files from a Telemetry system) 16 | - A streaming data source (e.g. Kinesis data stream batching files and dumping them into S3) 17 | 18 | `sdlf-dataset` creates a Glue database, as well as a Glue crawler. 19 | 20 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 
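Other modules — or ad-hoc scripts — can resolve these values at runtime. A minimal sketch using boto3 (the `iot`/`legislators` names simply mirror the usage example below and are illustrative; the parameter paths are listed in the interface table below):

```
import boto3

ssm = boto3.client("ssm")

# Dataset catalog and crawler identifiers (names or ARNs), as published by sdlf-dataset
database = ssm.get_parameter(Name="/SDLF/Glue/iot/legislators/DataCatalog")["Parameter"]["Value"]
crawler = ssm.get_parameter(Name="/SDLF/Glue/iot/legislators/GlueCrawler")["Parameter"]["Value"]

print(f"Glue database: {database}, crawler: {crawler}")
```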
21 | 22 | ## Usage 23 | 24 | ### CloudFormation with [sdlf-cicd](cicd.md) 25 | 26 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 27 | 28 | ``` 29 | rExample: 30 | Type: awslabs::sdlf::dataset::MODULE 31 | Properties: 32 | pPipelineReference: !Ref pPipelineReference 33 | pTeamName: iot 34 | pDatasetName: legislators 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-dataset` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ----------------------------------------- | -------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Datasets/{team}/{dataset}` | Dataset-specific metadata for data pipelines | | 44 | | `/SDLF/Glue/{team}/{dataset}/GlueCrawler` | Team dataset Glue crawler | | 45 | | `/SDLF/Glue/{team}/{dataset}/DataCatalog` | Team dataset metadata catalog" | | 46 | -------------------------------------------------------------------------------- /sdlf-foundations/src/lambda/catalog/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from datetime import UTC, datetime 5 | from urllib.parse import unquote_plus 6 | 7 | import boto3 8 | from botocore.config import Config 9 | from botocore.exceptions import ClientError 10 | 11 | session_config = Config(user_agent_extra="awssdlf/2.11.0") 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | dynamodb = boto3.client("dynamodb", config=session_config) 16 | catalog_table = os.getenv("OBJECTMETADATA_TABLE") 17 | 18 | 19 | def parse_s3_event(s3_event): 20 | return { 21 | "bucket": {"S": s3_event["detail"]["bucket"]["name"]}, 22 | "key": {"S": unquote_plus(s3_event["detail"]["object"]["key"])}, 23 | "size": {"N": str(s3_event["detail"]["object"]["size"])}, 24 | "last_modified_date": {"S": s3_event["time"]}, 25 | "timestamp": {"N": str(int(round(datetime.now(UTC).timestamp() * 1000, 0)))}, 26 | } 27 | 28 | 29 | def put_item(table, item, key): 30 | try: 31 | response = dynamodb.put_item( 32 | TableName=table, 33 | Item=item, 34 | ConditionExpression=f"attribute_not_exists({key})", 35 | ) 36 | except ClientError as e: 37 | if e.response["Error"]["Code"] == "ConditionalCheckFailedException": 38 | logger.info(e.response["Error"]["Message"]) 39 | else: 40 | raise 41 | else: 42 | return response 43 | 44 | 45 | def delete_item(table, key): 46 | try: 47 | response = dynamodb.delete_item(TableName=table, Key=key) 48 | except ClientError as e: 49 | logger.error("Fatal error", exc_info=True) 50 | raise e 51 | else: 52 | return response 53 | 54 | 55 | def lambda_handler(event, context): 56 | try: 57 | logger.info(f"Received {len(event['Records'])} messages") 58 | for record in event["Records"]: 59 | logger.info("Parsing S3 Event") 60 | message = json.loads(record["body"]) 61 | operation = message["detail-type"] 62 | bucket = message["detail"]["bucket"]["name"] 63 | key = unquote_plus(message["detail"]["object"]["key"]) 64 | id = f"s3://{bucket}/{key}" 65 | 66 | logger.info(f"Performing Dynamo {operation} operation") 67 | if operation in ["Object Deleted"]: 68 | delete_item(catalog_table, {"id": id}) 69 | else: 70 | item = parse_s3_event(message) 71 | item["id"] = {"S": id} 72 | item["stage"] = {"S": bucket.split("-")[-1]} 73 | 
put_item(catalog_table, item, "id") 74 | except Exception as e: 75 | logger.error("Fatal error", exc_info=True) 76 | raise e 77 | -------------------------------------------------------------------------------- /sdlf-team/src/lambda/datasets-dynamodb/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | import boto3 6 | from boto3.dynamodb.types import TypeSerializer 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | dynamodb = boto3.client("dynamodb") 12 | ssm_endpoint_url = "https://ssm." + os.getenv("AWS_REGION") + ".amazonaws.com" 13 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 14 | 15 | 16 | def delete_dynamodb_dataset_entry(table_name, team_name, dataset_name): 17 | response = dynamodb.delete_item( 18 | TableName=table_name, 19 | Key={"name": {"S": f"{team_name}-{dataset_name}"}}, 20 | ) 21 | return response 22 | 23 | 24 | def create_dynamodb_dataset_entry(table_name, team_name, dataset_name, pipeline_details): 25 | pipeline_details_dynamodb_json = TypeSerializer().serialize(pipeline_details) 26 | logger.info("PIPELINE DETAILS DYNAMODB JSON: %s", pipeline_details_dynamodb_json) 27 | response = dynamodb.update_item( 28 | TableName=table_name, 29 | Key={"name": {"S": f"{team_name}-{dataset_name}"}}, 30 | ExpressionAttributeNames={ 31 | "#P": "pipeline", 32 | "#V": "version", 33 | }, 34 | ExpressionAttributeValues={ 35 | ":p": pipeline_details_dynamodb_json, 36 | ":v": {"N": "1"}, 37 | }, 38 | UpdateExpression="SET #P = :p, #V = :v", 39 | ReturnValues="UPDATED_NEW", 40 | ) 41 | return response 42 | 43 | 44 | def lambda_handler(event, context): 45 | try: 46 | environment = os.getenv("ENVIRONMENT") 47 | team_name = os.getenv("TEAM_NAME") 48 | custom_octagon_suffix = os.getenv("CUSTOM_OCTAGON_SUFFIX", "") 49 | table = f"octagon-Datasets-{environment}{custom_octagon_suffix}" 50 | 51 | paginator = ssm.get_paginator("get_parameters_by_path") 52 | datasets_pages = paginator.paginate(Path=f"/SDLF/Datasets/{team_name}") 53 | 54 | for datasets_page in datasets_pages: 55 | for dataset in datasets_page["Parameters"]: 56 | dataset_name = dataset["Name"].split("/")[-1] 57 | logger.info("DATASET SSM CONTENT: %s", dataset["Value"]) 58 | dataset_pipeline_details = json.loads(dataset["Value"]) 59 | create_dynamodb_dataset_entry(table, team_name, dataset_name, dataset_pipeline_details) 60 | logger.info(f"{team_name}-{dataset_name} DynamoDB Dataset entry created") 61 | 62 | logger.info("INFO: Entries for datasets that no longer exist are not removed from DynamoDB") 63 | except Exception as e: 64 | message = "Function exception: " + str(e) 65 | logger.error(message, exc_info=True) 66 | raise 67 | 68 | return "Success" 69 | -------------------------------------------------------------------------------- /sdlf-team/src/lambda/pipelines-dynamodb/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | logger = logging.getLogger() 7 | logger.setLevel(logging.INFO) 8 | 9 | dynamodb = boto3.client("dynamodb") 10 | ssm_endpoint_url = "https://ssm." 
+ os.getenv("AWS_REGION") + ".amazonaws.com" 11 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 12 | 13 | 14 | def delete_dynamodb_pipeline_entry(table_name, team_name, pipeline_name, stage_name): 15 | response = dynamodb.delete_item( 16 | TableName=table_name, 17 | Key={"name": {"S": f"{team_name}-{pipeline_name}-{stage_name}"}}, 18 | ) 19 | return response 20 | 21 | 22 | def create_dynamodb_pipeline_entry(table_name, team_name, pipeline_name, stage_name): 23 | response = dynamodb.update_item( 24 | TableName=table_name, 25 | Key={"name": {"S": f"{team_name}-{pipeline_name}-{stage_name}"}}, 26 | ExpressionAttributeNames={ 27 | "#T": "type", 28 | "#S": "status", 29 | "#P": "pipeline", 30 | "#V": "version", 31 | }, 32 | ExpressionAttributeValues={ 33 | ":t": { 34 | "S": "TRANSFORMATION", 35 | }, 36 | ":s": {"S": "ACTIVE"}, 37 | ":p": {"M": {"max_items_process": {"N": "100"}, "min_items_process": {"N": "1"}}}, 38 | ":v": {"N": "1"}, 39 | }, 40 | UpdateExpression="SET #T = :t, #S = :s, #P = :p, #V = :v", 41 | ReturnValues="UPDATED_NEW", 42 | ) 43 | return response 44 | 45 | 46 | def lambda_handler(event, context): 47 | try: 48 | environment = os.getenv("ENVIRONMENT") 49 | team_name = os.getenv("TEAM_NAME") 50 | custom_octagon_suffix = os.getenv("CUSTOM_OCTAGON_SUFFIX", "") 51 | table = f"octagon-Pipelines-{environment}{custom_octagon_suffix}" 52 | 53 | paginator = ssm.get_paginator("get_parameters_by_path") 54 | stages_pages = paginator.paginate( 55 | Path=f"/SDLF/Pipelines/{team_name}", 56 | Recursive=True, 57 | ) 58 | for stages_page in stages_pages: 59 | for stage in stages_page["Parameters"]: 60 | pipeline_name = stage["Name"].split("/")[-2] 61 | stage_name = stage["Name"].split("/")[-1] 62 | create_dynamodb_pipeline_entry(table, team_name, pipeline_name, stage_name) 63 | logger.info(f"{team_name}-{pipeline_name}-{stage_name} DynamoDB Pipeline entry created") 64 | 65 | logger.info("INFO: Entries for stages that no longer exist are *not* removed from DynamoDB") 66 | except Exception as e: 67 | message = "Function exception: " + str(e) 68 | logger.error(message, exc_info=True) 69 | raise 70 | 71 | return "Success" 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Editors 2 | .vscode/ 3 | *.code-workspace 4 | .idea/ 5 | .devcontainer/ 6 | 7 | # Mac/OSX 8 | .DS_Store 9 | 10 | # Windows 11 | Thumbs.db 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Misc 22 | automated-deployment/ 23 | rpdk.log 24 | 25 | # Distribution / packaging 26 | output/ 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # cdk 158 | cdk.out 159 | cdk.context.json 160 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # AWS Serverless Data Lake Framework 2 | 3 | Serverless Data Lake Framework (SDLF) is a collection of reusable artifacts aimed at accelerating the delivery of enterprise data lakes on AWS, shortening the deployment time to production from several months to a few weeks. It can be used by AWS teams, partners and customers to implement the foundational structure of a data lake following best practices. 4 | 5 | ## Motivation 6 | 7 | A data lake gives your organization agility. It provides a repository where consumers can quickly find the data they need and use it in their business projects. However, building a data lake can be complex; there's a lot to think about beyond the storage of files. For example, how do you catalog the data so you know what you've stored? What ingestion pipelines do you need? How do you manage data quality? How do you keep the code for your transformations under source control? How do you manage development, test and production environments? Building a solution that addresses these use cases can take many weeks and this time can be better spent innovating with data and achieving business goals. 
8 | 9 | SDLF is a collection of production-hardened, best-practices templates which accelerate your data lake implementation journey on AWS, so that you can focus on use cases that generate value for business. 10 | 11 | ## Major Features 12 | 13 | At a high level, SDLF is an infrastructure-as-code framework that enables customers to create: 14 | 15 | - End-to-end data architectures such as a centralized (transactional) data lake or a data mesh 16 | - Foundational data lake assets (e.g. Amazon S3 buckets for data storage) 17 | - Event-driven jobs that orchestrate the transformation of data, storing the output in a new location on S3 18 | - Data processing stages using AWS serverless services such as Lambda or Glue 19 | - Git-driven deployment pipelines (CICD) for the entire data infrastructure 20 | 21 | Using all SDLF features as illustrated in the [official workshop](https://sdlf.workshop.aws/) gives you: 22 | 23 | 1. **Traceability and version control**: 24 | - SDLF is entirely managed through CICD pipelines. At no point is interaction with the AWS console necessary (in fact it's discouraged). 25 | - Using version control ensures that any change to the data lake is scrutinized before it enters production. 26 | 27 | 2. **Scalability and reproducibility**: 28 | - Deploying and tearing down a customized, production-grade data lake can be done in minutes and across multiple accounts and environments. 29 | - This is in comparison to a manual approach which would be tedious, slow, prone to errors and unable to scale. 30 | 31 | 3. **Best practices**: 32 | - Best practices acquired through dozens of implementations in production are enforced in the framework. 33 | - Features such as monitoring (S3 Storage Lens, Cloudtrail), encryption (KMS), alerting (Cloudwatch alarms), data permissions (Lake Formation) and many more are baked in SDLF so you don't have to reinvent the wheel. 
34 | 35 | ## Public References 36 | 37 | ![SDLF Public References](_static/public-references.png) -------------------------------------------------------------------------------- /sdlf-stage-lambda/src/lambda/process-object/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import PurePath 4 | 5 | from datalake_library.commons import init_logger 6 | from datalake_library.interfaces.s3_interface import S3Interface 7 | from datalake_library.sdlf import ( 8 | KMSConfiguration, 9 | S3Configuration, 10 | ) 11 | 12 | logger = init_logger(__name__) 13 | s3_prefix = os.environ["S3_PREFIX"] 14 | deployment_instance = os.environ["DEPLOYMENT_INSTANCE"] 15 | storage_deployment_instance = os.environ["STORAGE_DEPLOYMENT_INSTANCE"] 16 | 17 | 18 | def transform_object(bucket, key): 19 | s3_interface = S3Interface() 20 | # IMPORTANT: Stage bucket where transformed data must be uploaded 21 | stage_bucket = S3Configuration(instance=storage_deployment_instance).stage_bucket 22 | # Download S3 object locally to /tmp directory 23 | # The s3_helper.download_object method 24 | # returns the local path where the file was saved 25 | local_path = s3_interface.download_object(bucket, key) 26 | 27 | # Apply business business logic: 28 | # Below example is opening a JSON file and 29 | # extracting fields, then saving the file 30 | # locally and re-uploading to Stage bucket 31 | def parse(json_data): 32 | l = [] # noqa: E741 33 | for d in json_data: 34 | o = d.copy() 35 | for k in d: 36 | if type(d[k]) in [dict, list]: 37 | o.pop(k) 38 | l.append(o) 39 | 40 | return l 41 | 42 | # Reading file locally 43 | with open(local_path, "r") as raw_file: 44 | data = raw_file.read() 45 | 46 | json_data = json.loads(data) 47 | 48 | # Saving file locally to /tmp after parsing 49 | output_path = f"{PurePath(local_path).with_suffix('')}_parsed.json" 50 | with open(output_path, "w", encoding="utf-8") as write_file: 51 | json.dump(parse(json_data), write_file, ensure_ascii=False, indent=4) 52 | 53 | # Uploading file to Stage bucket at appropriate path 54 | # IMPORTANT: Build the output s3_path without the s3://stage-bucket/ 55 | s3_path = f"{s3_prefix}/{deployment_instance}/{PurePath(output_path).name}" 56 | # IMPORTANT: Notice "stage_bucket" not "bucket" 57 | kms_key = KMSConfiguration(instance=storage_deployment_instance).data_kms_key 58 | s3_interface.upload_object(output_path, stage_bucket, s3_path, kms_key=kms_key) 59 | 60 | return s3_path 61 | 62 | 63 | def lambda_handler(event, context): 64 | """Calls custom transform developed by user 65 | 66 | Arguments: 67 | event {dict} -- Dictionary with details on previous processing step 68 | context {dict} -- Dictionary with details on Lambda context 69 | 70 | Returns: 71 | {dict} -- Dictionary with Processed Bucket and Key(s) 72 | """ 73 | try: 74 | # this default Lambda expects records to be S3 events 75 | for record in event: 76 | logger.info(f"Processing file: {record['object']['key']} in {record['bucket']['name']}") 77 | try: 78 | transform_object(record["bucket"]["name"], record["object"]["key"]) 79 | record["processed"] = True 80 | except json.decoder.JSONDecodeError as e: 81 | record["processed"] = False 82 | record["error"] = repr(e) 83 | 84 | except Exception as e: 85 | logger.error("Fatal error", exc_info=True) 86 | raise e 87 | 88 | return event 89 | -------------------------------------------------------------------------------- 
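The handler above iterates over S3 event records, transforms each object into the stage bucket and flags each record as processed. A rough local-invocation sketch — bucket, key and instance values are illustrative, and `datalake_library` plus AWS credentials (with real raw/stage buckets) are assumed:

```
import os

# Read at import time by the module above (values illustrative)
os.environ["S3_PREFIX"] = "legislators"
os.environ["DEPLOYMENT_INSTANCE"] = "mainA"
os.environ["STORAGE_DEPLOYMENT_INSTANCE"] = "dev"

from lambda_function import lambda_handler  # noqa: E402  # the process-object handler shown above

# Record shape inferred from the handler: EventBridge-style S3 object details
sample_event = [
    {
        "bucket": {"name": "example-raw-bucket"},
        "object": {"key": "legislators/persons.json"},
    }
]

result = lambda_handler(sample_event, None)  # context is not used by this handler
for record in result:
    print(record.get("processed"), record.get("error"))
```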
/sdlf-stageA/lambda/stage-a-process-object/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import PurePath 3 | 4 | from datalake_library import DataLakeClient 5 | from datalake_library.commons import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def transform_object(bucket, key, team, dataset): 11 | # Initialize data lake client with team/dataset/stage parameters 12 | client = DataLakeClient(team=team, dataset=dataset, stage="a") 13 | 14 | # IMPORTANT: Stage bucket where transformed data must be uploaded 15 | stage_bucket = client.s3.stage_bucket 16 | # Download S3 object locally to /tmp directory 17 | local_path = client.s3.download_object(bucket, key) 18 | 19 | # Apply business logic: 20 | # Below example is opening a JSON file and 21 | # extracting fields, then saving the file 22 | # locally and re-uploading to Stage bucket 23 | def parse(json_data): 24 | l = [] # noqa: E741 25 | for d in json_data: 26 | o = d.copy() 27 | for k in d: 28 | if type(d[k]) in [dict, list]: 29 | o.pop(k) 30 | l.append(o) 31 | return l 32 | 33 | # Reading file locally 34 | with open(local_path, "r") as raw_file: 35 | data = raw_file.read() 36 | 37 | json_data = json.loads(data) 38 | 39 | # Saving file locally to /tmp after parsing 40 | output_path = f"{PurePath(local_path).with_suffix('')}_parsed.json" 41 | with open(output_path, "w", encoding="utf-8") as write_file: 42 | json.dump(parse(json_data), write_file, ensure_ascii=False, indent=4) 43 | 44 | # Uploading file to Stage bucket at appropriate path 45 | # IMPORTANT: Build the output s3_path without the s3://stage-bucket/ 46 | s3_path = f"pre-stage/{team}/{dataset}/{PurePath(output_path).name}" 47 | # IMPORTANT: Notice "stage_bucket" not "bucket" 48 | # you can select kms_key = client.kms.data_kms_key => to use the datalake domain data key 49 | # or use the particular team kms_key = client.kms.team_data_kms_key 50 | client.s3.upload_object(output_path, stage_bucket, s3_path, kms_key=client.kms.team_data_kms_key) 51 | # IMPORTANT S3 path(s) must be stored in a list 52 | processed_keys = [s3_path] 53 | 54 | ####################################################### 55 | # IMPORTANT 56 | # This function must return a Python list 57 | # of transformed S3 paths. 
Example: 58 | # ['pre-stage/engineering/legislators/persons_parsed.json'] 59 | ####################################################### 60 | 61 | return processed_keys 62 | 63 | 64 | def lambda_handler(event, context): 65 | """Calls custom transform developed by user 66 | 67 | Arguments: 68 | event {dict} -- Dictionary with details on previous processing step 69 | context {dict} -- Dictionary with details on Lambda context 70 | 71 | Returns: 72 | {dict} -- Dictionary with Processed Bucket and Key(s) 73 | """ 74 | try: 75 | logger.info("Fetching event data from previous step") 76 | bucket = event["bucket"] 77 | key = event["key"] 78 | team = event["team"] 79 | dataset = event["dataset"] 80 | 81 | logger.info("Calling user custom processing code") 82 | event["processedKeys"] = transform_object(bucket, key, team, dataset) 83 | logger.info("Successfully processed object") 84 | 85 | except Exception as e: 86 | logger.error("Fatal error", exc_info=True) 87 | raise e 88 | 89 | return event 90 | -------------------------------------------------------------------------------- /sdlf-stage-dataquality/lambda/initial-check/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | 4 | import boto3 5 | from datalake_library.commons import init_logger 6 | from datalake_library.configuration.resource_configs import DynamoConfiguration 7 | from datalake_library.interfaces.dynamo_interface import DynamoInterface 8 | 9 | logger = init_logger(__name__) 10 | 11 | dynamodb = boto3.client("dynamodb") 12 | ssm_endpoint_url = "https://ssm." + os.getenv("AWS_REGION") + ".amazonaws.com" 13 | ssm = boto3.client("ssm", endpoint_url=ssm_endpoint_url) 14 | glue_endpoint_url = "https://glue." + os.getenv("AWS_REGION") + ".amazonaws.com" 15 | glue = boto3.client("glue", endpoint_url=glue_endpoint_url) 16 | 17 | 18 | def get_glue_transform_details(bucket, team, dataset, env, pipeline, stage): 19 | dynamo_config = DynamoConfiguration() 20 | dynamo_interface = DynamoInterface(dynamo_config) 21 | 22 | transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}") 23 | 24 | glue_database = ssm.get_parameter(Name=f"/SDLF/Glue/{team}/{dataset}/DataCatalog")["Parameter"]["Value"] 25 | glue_capacity = {"NumberOfWorkers": 5} 26 | wait_time = 45 27 | 28 | dataquality_tables = [] 29 | 30 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 31 | if pipeline in transform_info.get("pipeline", {}): 32 | if stage in transform_info["pipeline"][pipeline]: 33 | logger.info(f"Details from DynamoDB: {transform_info['pipeline'][pipeline][stage]}") 34 | glue_capacity = transform_info["pipeline"][pipeline][stage].get("glue_capacity", glue_capacity) 35 | wait_time = transform_info["pipeline"][pipeline][stage].get("wait_time", wait_time) 36 | dataquality_tables = transform_info["pipeline"][pipeline][stage].get( 37 | "dataquality_tables", dataquality_tables 38 | ) 39 | 40 | return { 41 | "DatabaseName": glue_database, 42 | "wait_time": wait_time, 43 | "dataquality_tables": dataquality_tables, 44 | **glue_capacity, 45 | } 46 | 47 | 48 | def lambda_handler(event, context): 49 | """Calls custom transform developed by user 50 | 51 | Arguments: 52 | event {dict} -- Dictionary with details on previous processing step 53 | context {dict} -- Dictionary with details on Lambda context 54 | 55 | Returns: 56 | {dict} -- Dictionary with Data Quality Job details 57 | """ 58 | try: 59 | logger.info("Fetching event data from previous step") 60 | bucket = 
event["body"]["bucket"] 61 | team = event["body"]["team"] 62 | pipeline = event["body"]["pipeline"] 63 | stage = event["body"]["pipeline_stage"] 64 | dataset = event["body"]["dataset"] 65 | env = event["body"]["env"] 66 | 67 | # Checking if Data Quality is enabled on tables 68 | logger.info("Querying data quality enabled tables") 69 | event["body"]["glue"] = get_glue_transform_details(bucket, team, dataset, env, pipeline, stage) 70 | event["body"]["glue"]["crawler_name"] = "-".join(["sdlf", team, dataset, "post-stage-crawler"]) 71 | logger.info(event["body"]["glue"]) 72 | 73 | map_input = [] 74 | for table in event["body"]["glue"]["dataquality_tables"]: 75 | map_item = copy.deepcopy(event) 76 | map_item["body"]["glue"]["TableName"] = table 77 | map_input.append(map_item) 78 | 79 | except Exception as e: 80 | logger.error("Fatal error", exc_info=True) 81 | raise e 82 | return {"dataquality": map_input} 83 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/clean-up.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | STORAGE_DEPLOYMENT_INSTANCE=dev 4 | DATASET_DEPLOYMENT_INSTANCE=dev 5 | TEAM_NAME=engineering 6 | #PRINCIPAL= 7 | 8 | # echo "Granting Drop on Glue DBs" 9 | # SDLF_ORG=$(aws ssm get-parameter --name "/sdlf/storage/rOrganization/$STORAGE_DEPLOYMENT_INSTANCE" --query "Parameter.Value" --output text) 10 | # for DB in $(aws glue get-databases | jq -r '.[][].Name') 11 | # do 12 | # case "$DB" in 13 | # $SDLF_ORG*) aws lakeformation grant-permissions --principal DataLakePrincipalIdentifier="$PRINCIPAL" --permissions DROP --resource $(echo \'{\"Database\":{\"Name\":\"$DB\"}}\' | tr -d \');; 14 | # *) echo "Skipping non-SDLF database" ;; 15 | # esac 16 | # done 17 | 18 | echo "Fetch KMS keys ARN - SSM parameters won't be available once stacks have been deleted" 19 | declare -a KEYS=("/sdlf/storage/rKMSKey/$STORAGE_DEPLOYMENT_INSTANCE" 20 | "/sdlf/dataset/rKMSInfraKey/$DATASET_DEPLOYMENT_INSTANCE" 21 | "/sdlf/dataset/rKMSDataKey/$DATASET_DEPLOYMENT_INSTANCE" 22 | "/SDLF/KMS/$TEAM_NAME/InfraKeyId" 23 | ) 24 | KEYS_ARN=() 25 | for KEY in "${KEYS[@]}" 26 | do 27 | echo "Finding $KEY ARN" 28 | if KEY_ARN=$(aws ssm get-parameter --name "$KEY" --query "Parameter.Value" --output text); then 29 | KEYS_ARN+=("$KEY_ARN") 30 | else 31 | echo "Key does not exist, skipping" 32 | fi 33 | done 34 | 35 | echo "Emptying SDLF buckets..." 
36 | declare -a BUCKETS=("/sdlf/storage/rArtifactsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 37 | "/sdlf/storage/rRawBucket/$STORAGE_DEPLOYMENT_INSTANCE" 38 | "/sdlf/storage/rStageBucket/$STORAGE_DEPLOYMENT_INSTANCE" 39 | "/sdlf/storage/rAnalyticsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 40 | "/sdlf/storage/rAthenaBucket/$STORAGE_DEPLOYMENT_INSTANCE" 41 | "/sdlf/storage/rS3AccessLogsBucket/$STORAGE_DEPLOYMENT_INSTANCE" 42 | ) 43 | for BUCKET in "${BUCKETS[@]}" 44 | do 45 | echo "Finding $BUCKET bucket name" 46 | if S3_BUCKET=$(aws ssm get-parameter --name "$BUCKET" --query "Parameter.Value" --output text); then 47 | echo "Emptying $S3_BUCKET" 48 | aws s3 rm "s3://$S3_BUCKET" --recursive 49 | if [ "$(aws s3api get-bucket-versioning --bucket "$S3_BUCKET" --output text)" == "Enabled" ]; then 50 | objects_versions=$(aws s3api list-object-versions --bucket "$S3_BUCKET" --output=json --query='{Objects: Versions[].{Key:Key,VersionId:VersionId}}') 51 | if [ "$(jq -r ".Objects" <<< "$objects_versions")" != "null" ]; then 52 | aws s3api delete-objects --bucket "$S3_BUCKET" --delete "$objects_versions" 53 | fi 54 | fi 55 | else 56 | echo "Bucket does not exist, skipping" 57 | fi 58 | done 59 | 60 | echo "Deleting SDLF stacks..." 61 | STACKS=$(aws cloudformation list-stacks --query "StackSummaries[?starts_with(StackName,'sdlf-') && StackStatus!='DELETE_COMPLETE']" | jq -r "sort_by(.CreationTime) | reverse[] | select(.ParentId == null) | .StackName") 62 | for STACK in $STACKS 63 | do 64 | echo "Deleting stack $STACK" 65 | aws cloudformation delete-stack --stack-name "$STACK" 66 | done 67 | for STACK in $STACKS 68 | do 69 | echo "Waiting for $STACK stack delete to complete ..." && aws cloudformation wait stack-delete-complete --stack-name "$STACK" && echo "Finished delete successfully!" 
70 | done 71 | 72 | echo "Deleting KMS keys" 73 | for KEY_ARN in "${KEYS_ARN[@]}" 74 | do 75 | echo "Deleting $KEY_ARN" 76 | aws kms schedule-key-deletion --key-id "$KEY_ARN" --pending-window-in-days 7 2>&1 77 | done 78 | -------------------------------------------------------------------------------- /sdlf-stageB/lambda/stage-b-routing/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datalake_library import DataLakeClient 5 | from datalake_library.commons import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def fetch_messages(team, pipeline, stage): 11 | client = DataLakeClient(team=team, pipeline=pipeline, stage=stage) 12 | # Default values, change if required 13 | min_items_to_process = 1 14 | max_items_to_process = 100 15 | 16 | logger.info(f"Pipeline is {pipeline}, stage is {stage}") 17 | logger.info("Querying {}-{}-{} objects waiting for processing".format(team, pipeline, stage)) 18 | 19 | keys_to_process = client.sqs.receive_min_max_messages( 20 | min_items_to_process, max_items_to_process, client.sqs.stage_queue_url 21 | ) 22 | 23 | logger.info("{} Objects ready for processing".format(len(keys_to_process))) 24 | return list(set(keys_to_process)) 25 | 26 | 27 | def lambda_handler(event, context): 28 | """Checks if any items need processing and triggers state machine 29 | Arguments: 30 | event {dict} -- Dictionary with details on what needs processing 31 | context {dict} -- Dictionary with details on Lambda context 32 | """ 33 | try: 34 | keys_to_process = [] 35 | trigger_type = event.get("trigger_type") # this is set by the schedule event rule 36 | if trigger_type: # scheduled 37 | records = fetch_messages(event["team"], event["pipeline"], event["pipeline_stage"]) 38 | else: 39 | records = event["Records"] 40 | logger.info("Received {} messages".format(len(records))) 41 | response = {} 42 | for record in records: 43 | if trigger_type: 44 | event_body = json.loads(json.loads(record)["output"])[0] 45 | else: 46 | event_body = json.loads(json.loads(record["body"])["output"])[0] 47 | 48 | team = event_body["team"] 49 | pipeline = event_body["pipeline"] 50 | stage = os.environ["PIPELINE_STAGE"] 51 | dataset = event_body["dataset"] 52 | org = event_body["org"] 53 | domain = event_body["domain"] 54 | env = event_body["env"] 55 | 56 | client = DataLakeClient(team=team, pipeline=pipeline, stage=stage) 57 | stage_bucket = client.s3.stage_bucket 58 | keys_to_process.extend(event_body["processedKeys"]) 59 | 60 | logger.info("{} Objects ready for processing".format(len(keys_to_process))) 61 | keys_to_process = list(set(keys_to_process)) 62 | 63 | response = { 64 | "statusCode": 200, 65 | "body": { 66 | "bucket": stage_bucket, 67 | "keysToProcess": keys_to_process, 68 | "team": team, 69 | "pipeline": pipeline, 70 | "pipeline_stage": stage, 71 | "dataset": dataset, 72 | "org": org, 73 | "domain": domain, 74 | "env": env, 75 | }, 76 | } 77 | if response: 78 | logger.info("Starting State Machine Execution") 79 | client.states.run_state_machine(client.states.state_machine_arn, response) 80 | except Exception as e: 81 | # If failure send to DLQ 82 | if keys_to_process: 83 | client = DataLakeClient(team=team, pipeline=pipeline, stage=stage) 84 | client.sqs.send_message_to_fifo_queue(json.dumps(response), "failed", client.sqs.stage_dlq_url) 85 | logger.error("Fatal error", exc_info=True) 86 | raise e 87 | -------------------------------------------------------------------------------- 
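The double `json.loads()` in the routing handler above is easy to misread: each SQS message body is JSON whose `output` field is itself a JSON-encoded list produced by the previous stage. A minimal sketch of that nesting, with made-up field values:

```
import json

# Illustrative message shape only - real messages are emitted by the previous
# stage's state machine.
inner = [{
    "team": "engineering",
    "pipeline": "main",
    "dataset": "legislators",
    "org": "example-org",
    "domain": "example-domain",
    "env": "dev",
    "processedKeys": ["pre-stage/engineering/legislators/persons_parsed.json"],
}]
record = {"body": json.dumps({"output": json.dumps(inner)})}

# Same expression as in lambda_handler above:
event_body = json.loads(json.loads(record["body"])["output"])[0]
assert event_body["processedKeys"] == inner[0]["processedKeys"]
```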
/sdlf-datalakeLibrary/python/datalake_library/interfaces/sqs_interface.py: -------------------------------------------------------------------------------- 1 | import math 2 | import uuid 3 | 4 | from .base_interface import BaseInterface 5 | 6 | 7 | class SQSInterface(BaseInterface): 8 | def __init__(self, team=None, dataset=None, pipeline=None, stage=None, log_level=None, session=None): 9 | super().__init__(team, dataset, pipeline, stage, log_level, session) 10 | 11 | def _initialize_client(self): 12 | """Initialize SQS client""" 13 | self.sqs = self.session.client("sqs", config=self.session_config) 14 | 15 | def _load_config(self): 16 | """Load SQS-specific configuration from SSM""" 17 | if self.team and self.stage and self.pipeline: 18 | self.stage_queue = self._get_ssm_parameter(f"/SDLF/SQS/{self.team}/{self.pipeline}{self.stage}Queue") 19 | self.stage_dlq = self._get_ssm_parameter(f"/SDLF/SQS/{self.team}/{self.pipeline}{self.stage}DLQ") 20 | 21 | @property 22 | def stage_queue_url(self): 23 | """Get stage queue URL""" 24 | return self.sqs.get_queue_url(QueueName=self.stage_queue)["QueueUrl"] 25 | 26 | @property 27 | def stage_dlq_url(self): 28 | """Get stage DLQ URL""" 29 | return self.sqs.get_queue_url(QueueName=self.stage_dlq)["QueueUrl"] 30 | 31 | def receive_messages(self, max_num_messages=1, queue_url=None): 32 | queue_url = queue_url or self.stage_queue_url 33 | messages = self.sqs.receive_message( 34 | QueueUrl=queue_url, MaxNumberOfMessages=max_num_messages, WaitTimeSeconds=1 35 | ).get("Messages", []) 36 | for message in messages: 37 | self.sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message["ReceiptHandle"]) 38 | return messages 39 | 40 | def receive_min_max_messages(self, min_items_process=1, max_items_process=100, queue_url=None): 41 | """Gets max_items_process messages from an SQS queue. 42 | :param min_items_process: Minimum number of items to process. 43 | :param max_items_process: Maximum number of items to process. 
44 | :return messages obtained 45 | """ 46 | messages = [] 47 | queue_url = queue_url or self.stage_queue_url 48 | num_messages_queue = int( 49 | self.sqs.get_queue_attributes(QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages"])[ 50 | "Attributes" 51 | ]["ApproximateNumberOfMessages"] 52 | ) 53 | 54 | # If not enough items to process, break with no messages 55 | if (num_messages_queue == 0) or (min_items_process > num_messages_queue): 56 | self.logger.info("Not enough messages - exiting") 57 | return messages 58 | 59 | # Only pull batch sizes of max_batch_size 60 | num_messages_queue = min(num_messages_queue, max_items_process) 61 | max_batch_size = 10 62 | batch_sizes = [max_batch_size] * math.floor(num_messages_queue / max_batch_size) 63 | if num_messages_queue % max_batch_size > 0: 64 | batch_sizes += [num_messages_queue % max_batch_size] 65 | 66 | for batch_size in batch_sizes: 67 | resp_msg = self.receive_messages(max_num_messages=batch_size) 68 | try: 69 | messages.extend(message["Body"] for message in resp_msg) 70 | except KeyError: 71 | break 72 | return messages 73 | 74 | def send_message_to_fifo_queue(self, message, group_id, queue_url=None): 75 | queue_url = queue_url or self.stage_queue_url 76 | self.sqs.send_message( 77 | QueueUrl=queue_url, MessageBody=message, MessageGroupId=group_id, MessageDeduplicationId=str(uuid.uuid4()) 78 | ) 79 | -------------------------------------------------------------------------------- /sdlf-stage-ecsfargate/src/state-machine/stage-ecsfargate.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Simple ECS Fargate-based transform", 3 | "StartAt": "Try", 4 | "States": { 5 | "Try": { 6 | "Type": "Parallel", 7 | "Branches": [ 8 | { 9 | "StartAt": "Pass", 10 | "States": { 11 | "Pass": { 12 | "Type": "Pass", 13 | "Next": "Records", 14 | "Parameters": { 15 | "Items.$": "States.StringToJson($)" 16 | } 17 | }, 18 | "Records": { 19 | "Type": "Map", 20 | "ItemProcessor": { 21 | "ProcessorConfig": { 22 | "Mode": "DISTRIBUTED", 23 | "ExecutionType": "STANDARD" 24 | }, 25 | "StartAt": "Execute ECS Fargate Transformation", 26 | "States": { 27 | "Execute ECS Fargate Transformation": { 28 | "Type": "Task", 29 | "Resource": "arn:aws:states:::ecs:runTask.sync", 30 | "Parameters": { 31 | "LaunchType": "FARGATE", 32 | "Cluster": "$.Items[0].transform.ecsfargate_cluster", 33 | "TaskDefinition": "$.Items[0].transform.transform" 34 | }, 35 | "End": true 36 | } 37 | } 38 | }, 39 | "Next": "Post-update Catalog", 40 | "Label": "Records", 41 | "MaxConcurrency": 50, 42 | "ToleratedFailurePercentage": 100, 43 | "ItemBatcher": { 44 | "MaxItemsPerBatch": 1 45 | }, 46 | "InputPath": "$.Items" 47 | }, 48 | "Post-update Catalog": { 49 | "Type": "Task", 50 | "Resource": "arn:aws:states:::lambda:invoke", 51 | "ResultPath": null, 52 | "Parameters": { 53 | "Payload.$": "$", 54 | "FunctionName": "${lPostMetadata}:$LATEST" 55 | }, 56 | "Retry": [ 57 | { 58 | "ErrorEquals": [ 59 | "Lambda.ServiceException", 60 | "Lambda.AWSLambdaException", 61 | "Lambda.SdkClientException", 62 | "Lambda.TooManyRequestsException" 63 | ], 64 | "IntervalSeconds": 2, 65 | "MaxAttempts": 6, 66 | "BackoffRate": 2 67 | } 68 | ], 69 | "End": true 70 | } 71 | } 72 | } 73 | ], 74 | "End": true, 75 | "Catch": [ 76 | { 77 | "ErrorEquals": [ 78 | "States.ALL" 79 | ], 80 | "ResultPath": null, 81 | "Next": "Error" 82 | } 83 | ] 84 | }, 85 | "Error": { 86 | "Type": "Task", 87 | "Resource": "arn:aws:states:::lambda:invoke", 88 | "OutputPath": 
"$.Payload", 89 | "Parameters": { 90 | "Payload.$": "$", 91 | "FunctionName": "${lError}:$LATEST" 92 | }, 93 | "Retry": [ 94 | { 95 | "ErrorEquals": [ 96 | "Lambda.ServiceException", 97 | "Lambda.AWSLambdaException", 98 | "Lambda.SdkClientException", 99 | "Lambda.TooManyRequestsException" 100 | ], 101 | "IntervalSeconds": 2, 102 | "MaxAttempts": 6, 103 | "BackoffRate": 2 104 | } 105 | ], 106 | "Next": "Fail" 107 | }, 108 | "Fail": { 109 | "Type": "Fail" 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pflag=false 3 | tflag=false 4 | dflag=false 5 | rflag=false 6 | 7 | DIRNAME=$(dirname "$0") 8 | 9 | usage () { echo " 10 | -h -- Opens up this help message 11 | -p -- Name of the AWS profile to use 12 | -t -- Team name (required) 13 | -d -- Dataset name (required) 14 | -r -- AWS region (required) 15 | "; } 16 | options=':p:t:d:r:h' 17 | while getopts "$options" option 18 | do 19 | case "$option" in 20 | p ) pflag=true; PROFILE=$OPTARG;; 21 | t ) tflag=true; TEAM=$OPTARG;; 22 | d ) dflag=true; DATASET=$OPTARG;; 23 | r ) rflag=true; REGION=$OPTARG;; 24 | h ) usage; exit;; 25 | \? ) echo "Unknown option: -$OPTARG" >&2; exit 1;; 26 | : ) echo "Missing option argument for -$OPTARG" >&2; exit 1;; 27 | * ) echo "Unimplemented option: -$OPTARG" >&2; exit 1;; 28 | esac 29 | done 30 | 31 | if ! "$tflag" 32 | then 33 | echo "Team name is required. Use -t " >&2 34 | usage 35 | exit 1 36 | fi 37 | 38 | if ! "$dflag" 39 | then 40 | echo "Dataset name is required. Use -d " >&2 41 | usage 42 | exit 1 43 | fi 44 | 45 | if ! "$rflag" 46 | then 47 | echo "AWS region is required. Use -r " >&2 48 | usage 49 | exit 1 50 | fi 51 | 52 | if "$pflag" 53 | then 54 | echo "using AWS profile $PROFILE..." 
>&2 55 | fi 56 | 57 | echo "using team: $TEAM" >&2 58 | echo "using dataset: $DATASET" >&2 59 | echo "using region: $REGION" >&2 60 | 61 | ARTIFACTS_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/SDLF2/S3/ArtifactsBucket" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 62 | aws s3 cp "$DIRNAME/scripts/legislators-glue-job.py" "s3://$ARTIFACTS_BUCKET/artifacts/" ${PROFILE:+--profile "$PROFILE"} 63 | 64 | mkdir -p "$DIRNAME"/output 65 | 66 | function send_data() 67 | { 68 | ORIGIN="$DIRNAME/data/" 69 | 70 | RAW_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/SDLF2/S3/RawBucket" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 71 | KMS_KEY=$(aws --region "$REGION" ssm get-parameter --name "/SDLF/KMS/$TEAM/DataKeyId" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"}) 72 | 73 | S3_DESTINATION=s3://$RAW_BUCKET/ 74 | COUNT=0 75 | for FILE in "$ORIGIN"/*.json; 76 | do 77 | (( COUNT++ )) || true 78 | aws s3 cp "$FILE" "${S3_DESTINATION}${TEAM}/${DATASET}/" --sse aws:kms --sse-kms-key-id "$KMS_KEY" ${PROFILE:+--profile "$PROFILE"} 79 | echo "transferred $COUNT files" 80 | done 81 | } 82 | 83 | VPC_SUPPORT=$(aws --region "$REGION" ssm get-parameter --name "/SDLF/VPC/Enabled" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"} 2>/dev/null) 84 | if [ -z "$VPC_SUPPORT" ] 85 | then 86 | aws --region "$REGION" ssm put-parameter --name "/SDLF/VPC/Enabled" --value "false" --type String ${PROFILE:+--profile "$PROFILE"} 87 | fi 88 | 89 | aws cloudformation package --template-file "$DIRNAME"/scripts/legislators-glue-job.yaml \ 90 | --s3-bucket "$ARTIFACTS_BUCKET" \ 91 | ${PROFILE:+--profile "$PROFILE"} \ 92 | --output-template-file "$DIRNAME"/output/packaged-template.yaml 93 | 94 | STACK_NAME="sdlf-${TEAM}-${DATASET}-glue-job" 95 | aws cloudformation deploy \ 96 | --s3-bucket "$ARTIFACTS_BUCKET" --s3-prefix sdlf-utils \ 97 | --stack-name "$STACK_NAME" \ 98 | --template-file "$DIRNAME"/output/packaged-template.yaml \ 99 | --parameter-overrides pTeamName="$TEAM" pDatasetName="$DATASET" \ 100 | --tags Framework=sdlf Team="$TEAM" Dataset="$DATASET" \ 101 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" \ 102 | --region "$REGION" \ 103 | ${PROFILE:+--profile "$PROFILE"} || exit 1 104 | 105 | send_data 106 | -------------------------------------------------------------------------------- /sdlf-team/README.md: -------------------------------------------------------------------------------- 1 | # sdlf-team 2 | 3 | !!! note 4 | `sdlf-team` is defined in the [sdlf-team](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-team) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Team](../_static/sdlf-team.png){: style="width:80%"} 9 | 10 | A team is a group of individuals that wish to onboard into the data lake. It can be a pizza team of developers or an entire Business Unit such as the marketing or finance department. A team is responsible for their data pipelines, datasets and repositories which are unique to the team and completely segregated from others. Teams are also isolated from both an operational and security standpoint through least-privilege IAM policies. 11 | 12 | As such `sdlf-team` is mostly about permissions. 
13 | 14 | The two `Pipelines` and `Datasets` Lambda functions (and related resources) are used to populate the DynamoDB tables `octagon-Pipelines-{environment}` and `octagon-Datasets-{environment}` from `sdlf-foundations`. 15 | 16 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 17 | 18 | !!! warning 19 | The data lake admin team should be the only one with write access to the `sdlf-team` code base, as it is used to restrict permissions given to team members. 20 | 21 | ## Usage 22 | 23 | ### CloudFormation with [sdlf-cicd](cicd.md) 24 | 25 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 26 | 27 | ``` 28 | rExample: 29 | Type: awslabs::sdlf::team::MODULE 30 | Properties: 31 | pPipelineReference: !Ref pPipelineReference 32 | pTeamName: industry 33 | pEnvironment: dev 34 | pSNSNotificationsEmail: nobody@amazon.com 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-team` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Athena/{team}/WorkgroupName` | Team Athena workgroup name | | 44 | | `/SDLF/EventBridge/{team}/EventBusName` | Name of the team dedicated event bus | | 45 | | `/SDLF/EventBridge/{team}/ScheduleGroupName` | Name of the team dedicated schedule group | | 46 | | `/SDLF/Glue/${pTeamName}/SecurityConfigurationId` | Glue security configuration name | | 47 | | `/SDLF/IAM/${pTeamName}/CrawlerRoleArn` | IAM Role ARN for Glue crawlers | | 48 | | `/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary` | ARN of the permissions boundary IAM Managed policy for the team | | 49 | | `/SDLF/KMS/${pTeamName}/DataKeyId` | ARN of the team KMS data key | | 50 | | `/SDLF/KMS/${pTeamName}/InfraKeyId` | ARN of the team KMS infrastructure key | | 51 | | `/SDLF/SNS/${pTeamName}/Notifications` | ARN of the team-specific SNS Topic | | 52 | -------------------------------------------------------------------------------- /docs/constructs/team.md: -------------------------------------------------------------------------------- 1 | # sdlf-team 2 | 3 | !!! note 4 | `sdlf-team` is defined in the [sdlf-team](https://github.com/awslabs/aws-serverless-data-lake-framework/tree/main/sdlf-team) folder of the [SDLF repository](https://github.com/awslabs/aws-serverless-data-lake-framework). 5 | 6 | ## Infrastructure 7 | 8 | ![SDLF Team](../_static/sdlf-team.png){: style="width:80%"} 9 | 10 | A team is a group of individuals that wish to onboard into the data lake. It can be a pizza team of developers or an entire Business Unit such as the marketing or finance department. A team is responsible for their data pipelines, datasets and repositories which are unique to the team and completely segregated from others. Teams are also isolated from both an operational and security standpoint through least-privilege IAM policies. 11 | 12 | As such `sdlf-team` is mostly about permissions. 13 | 14 | The two `Pipelines` and `Datasets` Lambda functions (and related resources) are used to populate the DynamoDB tables `octagon-Pipelines-{environment}` and `octagon-Datasets-{environment}` from `sdlf-foundations`. 
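As a sketch of how the SSM parameters published by `sdlf-team` (listed in the interface table) can be consumed at runtime with boto3 — deploy-time templates elsewhere in this repository use `{{resolve:ssm:...}}` instead — the team name below is illustrative only:

```
import boto3

ssm = boto3.client("ssm")
team = "engineering"  # illustrative team name

# Two of the parameters published by sdlf-team (see the interface table)
data_key_arn = ssm.get_parameter(Name=f"/SDLF/KMS/{team}/DataKeyId")["Parameter"]["Value"]
event_bus = ssm.get_parameter(Name=f"/SDLF/EventBridge/{team}/EventBusName")["Parameter"]["Value"]
```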
15 | 16 | SSM parameters holding names or ARNs are created for all resources that may be used by other modules. 17 | 18 | !!! warning 19 | The data lake admin team should be the only one with write access to the `sdlf-team` code base, as it is used to restrict permissions given to team members. 20 | 21 | ## Usage 22 | 23 | ### CloudFormation with [sdlf-cicd](cicd.md) 24 | 25 | Read the official [SDLF workshop](https://sdlf.workshop.aws/) for an end-to-end deployment example. 26 | 27 | ``` 28 | rExample: 29 | Type: awslabs::sdlf::team::MODULE 30 | Properties: 31 | pPipelineReference: !Ref pPipelineReference 32 | pTeamName: industry 33 | pEnvironment: dev 34 | pSNSNotificationsEmail: nobody@amazon.com 35 | ``` 36 | 37 | ## Interface 38 | 39 | Interfacing with other modules is done through [SSM Parameters](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). `sdlf-team` publishes the following parameters: 40 | 41 | | SSM Parameter | Description | Comment | 42 | | ------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------- | 43 | | `/SDLF/Athena/{team}/WorkgroupName` | Team Athena workgroup name | | 44 | | `/SDLF/EventBridge/{team}/EventBusName` | Name of the team dedicated event bus | | 45 | | `/SDLF/EventBridge/{team}/ScheduleGroupName` | Name of the team dedicated schedule group | | 46 | | `/SDLF/Glue/${pTeamName}/SecurityConfigurationId` | Glue security configuration name | | 47 | | `/SDLF/IAM/${pTeamName}/CrawlerRoleArn` | IAM Role ARN for Glue crawlers | | 48 | | `/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary` | ARN of the permissions boundary IAM Managed policy for the team | | 49 | | `/SDLF/KMS/${pTeamName}/DataKeyId` | ARN of the team KMS data key | | 50 | | `/SDLF/KMS/${pTeamName}/InfraKeyId` | ARN of the team KMS infrastructure key | | 51 | | `/SDLF/SNS/${pTeamName}/Notifications` | ARN of the team-specific SNS Topic | | 52 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-dataset.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/_static/drawio/sdlf-monitoring.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/_static/sdlf-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Produced by OmniGraffle 7.17.5\n2020-11-27 16:09:02 +0000 6 | 7 | Canvas 1 8 | 9 | Layer 2 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /sdlf-cicd/template-glue-job.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Transform: AWS::LanguageExtensions 3 | Description: Deploy Glue jobs 4 | 5 | Parameters: 6 | pArtifactsBucket: 7 | Description: The artifacts bucket used by CodeBuild and CodePipeline 8 | Type: String 9 | Default: 
"{{resolve:ssm:/SDLF2/S3/ArtifactsBucket}" 10 | pTeamName: 11 | Description: Name of the team (all lowercase, no symbols or spaces) 12 | Type: String 13 | pGlueJobs: 14 | Description: List of glue job names 15 | Type: CommaDelimitedList 16 | AllowedPattern: "^[a-zA-Z0-9\\-]*$" 17 | pEnableVpc: 18 | Description: Deploy SDLF resources in a VPC 19 | Type: AWS::SSM::Parameter::Value 20 | Default: /SDLF/VPC/Enabled 21 | 22 | Conditions: 23 | GlueJobsNotEmpty: !Not 24 | - !Equals 25 | - !Join ["", !Ref pGlueJobs] 26 | - "" 27 | RunInVpc: !Equals [!Ref pEnableVpc, true] 28 | 29 | Resources: 30 | rGlueRole: 31 | Type: AWS::IAM::Role 32 | Properties: 33 | Path: /service-role/ 34 | PermissionsBoundary: !Sub "{{resolve:ssm:/SDLF/IAM/${pTeamName}/TeamPermissionsBoundary}}" 35 | AssumeRolePolicyDocument: 36 | Version: 2012-10-17 37 | Statement: 38 | - Effect: Allow 39 | Principal: 40 | Service: 41 | - glue.amazonaws.com 42 | Action: 43 | - sts:AssumeRole 44 | ManagedPolicyArns: 45 | - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole 46 | - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess 47 | - !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchLogsFullAccess 48 | Policies: 49 | - PolicyName: !Sub sdlf-${pTeamName}-glue-job 50 | PolicyDocument: 51 | Version: 2012-10-17 52 | Statement: 53 | - Effect: Allow 54 | Action: 55 | - kms:CreateGrant 56 | - kms:Decrypt 57 | - kms:DescribeKey 58 | - kms:Encrypt 59 | - kms:GenerateDataKey* 60 | - kms:ReEncrypt* 61 | Resource: 62 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/InfraKeyId}}" 63 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/DataKeyId}}" 64 | - "{{resolve:ssm:/SDLF2/KMS/KeyArn}}" 65 | 66 | "Fn::ForEach::GlueJobResources": 67 | - GlueJobName 68 | - !Ref pGlueJobs 69 | - "r&{GlueJobName}GlueJob": 70 | Type: AWS::Glue::Job 71 | Condition: GlueJobsNotEmpty 72 | Properties: 73 | Command: 74 | Name: glueetl 75 | PythonVersion: "3" 76 | ScriptLocation: !Sub s3://${pArtifactsBucket}/${pTeamName}/transforms/${GlueJobName}.py 77 | DefaultArguments: !If 78 | - RunInVpc 79 | - 80 | "--job-bookmark-option": job-bookmark-disable 81 | "--enable-glue-datacatalog": "true" 82 | "--enable-continuous-cloudwatch-log": "true" 83 | "--enable-continuous-log-filter": "true" 84 | "--enable-metrics": "true" 85 | "--disable-proxy-v2": "true" 86 | - 87 | "--job-bookmark-option": job-bookmark-disable 88 | "--enable-glue-datacatalog": "true" 89 | "--enable-continuous-cloudwatch-log": "true" 90 | "--enable-continuous-log-filter": "true" 91 | "--enable-metrics": "true" 92 | ExecutionProperty: 93 | MaxConcurrentRuns: 10 94 | MaxRetries: 0 95 | MaxCapacity: 2.0 96 | GlueVersion: "4.0" 97 | Name: !Sub 98 | - sdlf-${pTeamName}-${BaseGlueJobName} 99 | - BaseGlueJobName: !Select [0, !Split ["-", !Ref GlueJobName]] 100 | SecurityConfiguration: !Sub "{{resolve:ssm:/SDLF/Glue/${pTeamName}/SecurityConfigurationId}}" 101 | Role: !Ref rGlueRole 102 | Connections: !If 103 | - RunInVpc 104 | - Connections: 105 | - BUILDSTEPVARIABLE_GLUECONNECTIONS 106 | - !Ref "AWS::NoValue" 107 | -------------------------------------------------------------------------------- /sdlf-utils/workshop-examples/legislators/scripts/legislators-glue-job.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Glue Job Sample 3 | 4 | Parameters: 5 | pTeamName: 6 | Type: String 7 | Description: Team name for resource naming 8 | AllowedPattern: "[a-z0-9]{2,12}" 9 | ConstraintDescription: Must be 
between 2 and 12 characters, lowercase letters and numbers only 10 | pDatasetName: 11 | Type: String 12 | Description: Dataset name for resource naming 13 | AllowedPattern: "[a-z0-9]{2,12}" 14 | ConstraintDescription: Must be between 2 and 12 characters, lowercase letters and numbers only 15 | pPipelineDeploymentInstance: 16 | Type: String 17 | Description: specific pipeline stage deployment instance this job is for 18 | Default: mainB 19 | pArtifactsBucket: 20 | Description: S3 bucket used to store artifacts (from CICD or generated by data pipelines) 21 | Type: AWS::SSM::Parameter::Value 22 | Default: /SDLF2/S3/ArtifactsBucket 23 | pEnableVpc: 24 | Description: Deploy SDLF resources in a VPC 25 | Type: AWS::SSM::Parameter::Value 26 | Default: /SDLF/VPC/Enabled 27 | 28 | Conditions: 29 | RunInVpc: !Equals [!Ref pEnableVpc, true] 30 | 31 | Resources: 32 | rGlueRole: 33 | Type: AWS::IAM::Role 34 | Properties: 35 | RoleName: !Sub sdlf-${pTeamName}-${pDatasetName}-glue-role 36 | Path: /service-role/ 37 | AssumeRolePolicyDocument: 38 | Version: 2012-10-17 39 | Statement: 40 | - Effect: Allow 41 | Principal: 42 | Service: 43 | - glue.amazonaws.com 44 | Action: 45 | - sts:AssumeRole 46 | ManagedPolicyArns: 47 | - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole 48 | - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess 49 | - !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchLogsFullAccess 50 | Policies: 51 | - PolicyName: !Sub sdlf-${pTeamName}-${pDatasetName}-glue-policy 52 | PolicyDocument: 53 | Version: 2012-10-17 54 | Statement: 55 | - Effect: Allow 56 | Action: 57 | - kms:CreateGrant 58 | - kms:Decrypt 59 | - kms:DescribeKey 60 | - kms:Encrypt 61 | - kms:GenerateDataKey* 62 | - kms:ReEncrypt* 63 | Resource: 64 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/InfraKeyId:1}}" 65 | - !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/DataKeyId:1}}" 66 | - "{{resolve:ssm:/SDLF2/KMS/KeyArn:1}}" 67 | 68 | rGlueJob: 69 | Type: AWS::Glue::Job 70 | Properties: 71 | Command: 72 | Name: glueetl 73 | PythonVersion: "3" 74 | ScriptLocation: !Sub s3://${pArtifactsBucket}/artifacts/legislators-glue-job.py 75 | DefaultArguments: !If 76 | - RunInVpc 77 | - 78 | "--job-bookmark-option": job-bookmark-enable 79 | "--enable-metrics": "" 80 | "--disable-proxy-v2": "true" 81 | - 82 | "--job-bookmark-option": job-bookmark-enable 83 | "--enable-metrics": "" 84 | ExecutionProperty: 85 | MaxConcurrentRuns: 3 86 | MaxRetries: 0 87 | MaxCapacity: 2.0 88 | GlueVersion: "4.0" 89 | Name: !Sub sdlf-${pTeamName}-${pDatasetName}-glue-job 90 | SecurityConfiguration: !Sub "{{resolve:ssm:/SDLF/Glue/${pTeamName}/SecurityConfigurationId:1}}" 91 | Role: !Ref rGlueRole 92 | 93 | Outputs: 94 | oGlueJobName: 95 | Description: Name of the Glue job 96 | Value: !Ref rGlueJob 97 | Export: 98 | Name: !Sub ${AWS::StackName}-glue-job-name 99 | 100 | oGlueRoleArn: 101 | Description: ARN of the Glue job role 102 | Value: !GetAtt rGlueRole.Arn 103 | Export: 104 | Name: !Sub ${AWS::StackName}-glue-role-arn 105 | 106 | oPipelineDeploymentInstance: 107 | Description: Pipeline deployment instance 108 | Value: !Ref pPipelineDeploymentInstance 109 | -------------------------------------------------------------------------------- /sdlf-stageB/state-machine/stage-b.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Simple Glue-based transform", 3 | "StartAt": "Try", 4 | "States": { 5 | "Try": { 6 | "Type": "Parallel", 7 | "Branches": [ 8 | { 9 | "StartAt": 
"Pre-update Catalog", 10 | "States": { 11 | "Pre-update Catalog": { 12 | "Type": "Task", 13 | "Resource": "arn:aws:states:::lambda:invoke", 14 | "OutputPath": "$.Payload", 15 | "Parameters": { 16 | "Payload.$": "$", 17 | "FunctionName": "${lStep1}:$LATEST" 18 | }, 19 | "Retry": [ 20 | { 21 | "ErrorEquals": [ 22 | "Lambda.ServiceException", 23 | "Lambda.AWSLambdaException", 24 | "Lambda.SdkClientException", 25 | "Lambda.TooManyRequestsException" 26 | ], 27 | "IntervalSeconds": 2, 28 | "MaxAttempts": 6, 29 | "BackoffRate": 2 30 | } 31 | ], 32 | "Next": "Process Data" 33 | }, 34 | "Process Data": { 35 | "Type": "Task", 36 | "Resource": "arn:aws:states:::glue:startJobRun.sync", 37 | "ResultPath": "$.body.glue.run_output", 38 | "Parameters": { 39 | "JobName.$": "$.body.glue.job_name", 40 | "WorkerType.$": "$.body.glue.WorkerType", 41 | "NumberOfWorkers.$": "$.body.glue.NumberOfWorkers", 42 | "Arguments.$": "$.body.glue.arguments" 43 | }, 44 | "Next": "Run Glue Crawler" 45 | }, 46 | "Run Glue Crawler": { 47 | "Type": "Task", 48 | "Resource": "arn:aws:states:::aws-sdk:glue:startCrawler", 49 | "ResultPath": null, 50 | "Parameters": { 51 | "Name.$": "$.body.glue.crawler_name" 52 | }, 53 | "Next": "Wait X Seconds" 54 | }, 55 | "Wait X Seconds": { 56 | "Type": "Wait", 57 | "SecondsPath": "$.body.glue.wait_time", 58 | "Next": "GetCrawler" 59 | }, 60 | "GetCrawler": { 61 | "Type": "Task", 62 | "Resource": "arn:aws:states:::aws-sdk:glue:getCrawler", 63 | "ResultPath": "$.body.glue.crawler_response", 64 | "Parameters": { 65 | "Name.$": "$.body.glue.crawler_name" 66 | }, 67 | "Next": "Crawler Complete?" 68 | }, 69 | "Crawler Complete?": { 70 | "Type": "Choice", 71 | "Choices": [ 72 | { 73 | "Variable": "$.body.glue.crawler_response.Crawler.State", 74 | "StringEquals": "READY", 75 | "Next": "Success" 76 | } 77 | ], 78 | "Default": "Wait X Seconds" 79 | }, 80 | "Success": { 81 | "Type": "Succeed" 82 | } 83 | } 84 | } 85 | ], 86 | "End": true, 87 | "Catch": [ 88 | { 89 | "ErrorEquals": [ 90 | "States.ALL" 91 | ], 92 | "ResultPath": null, 93 | "Next": "Error" 94 | } 95 | ] 96 | }, 97 | "Error": { 98 | "Type": "Task", 99 | "Resource": "arn:aws:states:::lambda:invoke", 100 | "OutputPath": "$.Payload", 101 | "Parameters": { 102 | "Payload.$": "$", 103 | "FunctionName": "${lError}:$LATEST" 104 | }, 105 | "Retry": [ 106 | { 107 | "ErrorEquals": [ 108 | "Lambda.ServiceException", 109 | "Lambda.AWSLambdaException", 110 | "Lambda.SdkClientException", 111 | "Lambda.TooManyRequestsException" 112 | ], 113 | "IntervalSeconds": 2, 114 | "MaxAttempts": 6, 115 | "BackoffRate": 2 116 | } 117 | ], 118 | "Next": "Fail" 119 | }, 120 | "Fail": { 121 | "Type": "Fail" 122 | } 123 | } 124 | } --------------------------------------------------------------------------------