├── .terraform-version ├── glue ├── shared │ └── glue_shared_lib │ │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_helpers.py │ │ ├── test_glue_functions.py │ │ ├── test_utils.py │ │ └── test_boto3_helpers.py │ │ ├── requirements-dev.txt │ │ ├── src │ │ └── glue_shared │ │ │ ├── exceptions.py │ │ │ ├── str2obj.py │ │ │ ├── __init__.py │ │ │ ├── spark_helpers.py │ │ │ ├── helpers.py │ │ │ ├── pandas_helpers.py │ │ │ ├── argument_handlers.py │ │ │ ├── defaults.py │ │ │ ├── glue_interface.py │ │ │ └── boto3_helpers.py │ │ ├── setup.py │ │ └── Makefile └── data_sources │ ├── dummy_job │ └── dummy_transition │ │ ├── requirements.txt │ │ ├── dummy_transition.py │ │ └── Makefile │ └── ds1 │ ├── refined_to_curated │ ├── requirements.txt │ ├── config.py │ ├── refined_to_curated.py │ └── Makefile │ └── raw_to_refined │ ├── requirements.txt │ ├── config.py │ ├── raw_to_refined.py │ └── Makefile ├── _images ├── job_1.png ├── job_2.png └── job_3.png ├── arch_diagram.png ├── terraform ├── modules │ ├── glue-workflow │ │ ├── outputs.tf │ │ ├── main.tf │ │ ├── variables.tf │ │ └── README.md │ ├── iam-role │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── README.md │ ├── s3-bucket │ │ ├── outputs.tf │ │ ├── main.tf │ │ ├── variables.tf │ │ └── README.md │ ├── glue-job │ │ ├── 2.0 │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ ├── README.md │ │ │ └── variables.tf │ │ └── python_shell │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ ├── README.md │ │ │ └── variables.tf │ └── iam-policy │ │ ├── main.tf │ │ ├── variables.tf │ │ └── README.md └── solution │ ├── terraform.tfvars │ ├── variables.tf │ ├── provider.tf │ ├── README.md │ ├── glue_workflow_simple.tf │ ├── s3.tf │ ├── .terraform.lock.hcl │ ├── iam.tf │ ├── glue_workflow_complex.tf │ ├── glue_jobs.tf │ └── glue_jobs_dummy.tf ├── .env.example ├── Dockerfile ├── docker-compose.yml ├── Makefile ├── LICENSE ├── glue-jobs.sh ├── .gitignore ├── .pre-commit-config.yaml ├── README.md └── dummy_data └── cereal.csv /.terraform-version: -------------------------------------------------------------------------------- 1 | 0.14.3 2 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /glue/data_sources/dummy_job/dummy_transition/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | moto==1.3.14 2 | -------------------------------------------------------------------------------- /_images/job_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/_images/job_1.png -------------------------------------------------------------------------------- /_images/job_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/_images/job_2.png 
-------------------------------------------------------------------------------- /_images/job_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/_images/job_3.png -------------------------------------------------------------------------------- /arch_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/arch_diagram.png -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/requirements.txt: -------------------------------------------------------------------------------- 1 | file:../../../shared/glue_shared_lib#egg=glue-shared 2 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.0.3 2 | pyarrow==2.0.0 3 | s3fs==0.4.2 4 | file:../../../shared/glue_shared_lib#egg=glue-shared 5 | -------------------------------------------------------------------------------- /terraform/modules/glue-workflow/outputs.tf: -------------------------------------------------------------------------------- 1 | output "workflow_name" { 2 | description="AWS Glue Workflow Name" 3 | value = aws_glue_workflow.glue_workflow.name 4 | } 5 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_role" "role" { 2 | name = var.iam_role_name 3 | assume_role_policy = var.assume_role_policy 4 | tags = var.tags 5 | } 6 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "S3 bucket ID" 3 | value = aws_s3_bucket.s3.id 4 | } 5 | 6 | output "arn" { 7 | description = "S3 bucket ARN" 8 | value = aws_s3_bucket.s3.arn 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/outputs.tf: -------------------------------------------------------------------------------- 1 | output "job_name" { 2 | description="AWS Glue Job Name" 3 | value = aws_glue_job.job.name 4 | } 5 | 6 | output "job_arn" { 7 | description="AWS Glue Job ARN" 8 | value = aws_glue_job.job.arn 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/outputs.tf: -------------------------------------------------------------------------------- 1 | output "job_name" { 2 | description="AWS Glue Job Name" 3 | value = aws_glue_job.job.name 4 | } 5 | 6 | output "job_arn" { 7 | description="AWS Glue Job ARN" 8 | value = aws_glue_job.job.arn 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/outputs.tf: -------------------------------------------------------------------------------- 1 | output "iam_role_id" { 2 | description = "IAM role id" 3 | value = aws_iam_role.role.id 4 | } 5 | 6 | output "iam_role_arn" { 7 | description = "IAM role ARN" 8 | value = aws_iam_role.role.arn 9 | } 10 | -------------------------------------------------------------------------------- 
/.env.example: -------------------------------------------------------------------------------- 1 | TF_VAR_glue_bucket_name="bucket-where-glue-items-are-stored" 2 | TF_STATE_BUCKET="your-tf-state-bucket-name" 3 | AWS_REGION=us-east-1 4 | AWS_DEFAULT_REGION=us-east-1 5 | AWS_SECRET_ACCESS_KEY= 6 | AWS_ACCESS_KEY_ID= 7 | -------------------------------------------------------------------------------- /glue/data_sources/dummy_job/dummy_transition/dummy_transition.py: -------------------------------------------------------------------------------- 1 | """ 2 | A dummy transition for workflow simulation 3 | """ 4 | 5 | import datetime 6 | 7 | 8 | def main(): 9 | print(f"Dummy job runs at: {datetime.datetime.now()}") 10 | 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /terraform/modules/iam-policy/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_policy" "policy" { 2 | name = var.iam_role_policy_name 3 | policy = var.iam_role_policy 4 | } 5 | 6 | resource "aws_iam_policy_attachment" "attach_policy" { 7 | name = var.attachment_name 8 | roles = var.roles 9 | policy_arn = aws_iam_policy.policy.arn 10 | } 11 | -------------------------------------------------------------------------------- /terraform/solution/terraform.tfvars: -------------------------------------------------------------------------------- 1 | 2 | ######################################## 3 | # Account metadata 4 | ######################################## 5 | 6 | #assume_role_name = "terraform-user" 7 | #infra_provisioner = "terraform" 8 | region = "us-east-1" 9 | 10 | tags = { 11 | "terraform" = "true" 12 | } 13 | 14 | #glue_role = "glue-role" 15 | -------------------------------------------------------------------------------- /terraform/solution/variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | type = string 3 | description = "AWS region name" 4 | default = "us-east-1" 5 | } 6 | variable "tags" { 7 | type = map(string) 8 | description = "AWS resource tags" 9 | default = {} 10 | } 11 | 12 | variable "glue_bucket_name" { 13 | type = string 14 | description = "S3 bucket name where glue jobs are stored" 15 | } 16 | -------------------------------------------------------------------------------- /terraform/solution/provider.tf: -------------------------------------------------------------------------------- 1 | ######################################## 2 | # Provider 3 | ######################################## 4 | terraform { 5 | required_version = "0.14.3" 6 | backend "s3" { 7 | key = "" 8 | } 9 | required_providers { 10 | aws = { 11 | source = "hashicorp/aws" 12 | version = "~> 3.22.0" 13 | } 14 | } 15 | } 16 | 17 | provider "aws" { 18 | region = var.region 19 | } 20 | -------------------------------------------------------------------------------- /terraform/modules/glue-workflow/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_glue_workflow" "glue_workflow" { 2 | name = var.workflow_name 3 | } 4 | 5 | resource "aws_glue_security_configuration" "glue_security" { 6 | name = var.security_name 7 | 8 | encryption_configuration { 9 | cloudwatch_encryption { 10 | } 11 | 12 | job_bookmarks_encryption { 13 | } 14 | 15 | s3_encryption { 16 | s3_encryption_mode = "SSE-S3" 17 | } 18 | } 19 | } 20 | 
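# A possible extension, sketched here rather than taken from the original
# module: aws_glue_security_configuration.glue_security is created above but
# never exported, so jobs cannot reference it. Exporting its name would let
# the solution layer wire it into aws_glue_job's security_configuration
# argument.
output "security_configuration_name" {
  description = "AWS Glue security configuration name"
  value       = aws_glue_security_configuration.glue_security.name
}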
-------------------------------------------------------------------------------- /terraform/modules/glue-workflow/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "workflow_name" { 6 | type = string 7 | description = "The name you assign to this workflow." 8 | } 9 | 10 | 11 | variable "security_name" { 12 | type = string 13 | description = "Name of the security configuration." 14 | } 15 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "s3" { 2 | bucket = var.bucket_name 3 | force_destroy = true 4 | lifecycle { 5 | prevent_destroy = false 6 | } 7 | versioning { 8 | enabled = var.versioning_enabled 9 | } 10 | tags = var.tags 11 | 12 | 13 | server_side_encryption_configuration { 14 | rule { 15 | apply_server_side_encryption_by_default { 16 | sse_algorithm = "AES256" 17 | } 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "bucket_name" { 6 | type = string 7 | description = "The name of the bucket" 8 | } 9 | 10 | variable "tags" { 11 | type = map(any) 12 | description = "Tags associated with the bucket" 13 | } 14 | 15 | variable "versioning_enabled" { 16 | type = string 17 | description = "Enable versioning" 18 | } 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amazon/aws-glue-libs:glue_libs_1.0.0_image_01 2 | 3 | ARG TERRAFORM_VERSION="0.14.3" 4 | 5 | RUN curl https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip > terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ 6 | unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip -d /bin && \ 7 | rm -f terraform_${TERRAFORM_VERSION}_linux_amd64.zip 8 | 9 | RUN pip install -U pip && \ 10 | pip install -U wheel && \ 11 | pip install -U setuptools && \ 12 | pip install -U awscli boto3 13 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions specific to glue_shared library.""" 2 | 3 | 4 | class GlueJobError(Exception): 5 | """Base glue error.""" 6 | 7 | 8 | class ParametersNotFound(GlueJobError): 9 | """SSM parameters not found.""" 10 | 11 | 12 | class DataNotAvailable(GlueJobError): 13 | """Data not available.""" 14 | 15 | 16 | class JobFailedError(GlueJobError): 17 | """It looks like SystemExit is caught by glue, hence this is needed.""" 18 | 19 | 20 | class IllegalArgumentError(ValueError): 21 | """Illegal arguments supplied.""" 22 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | def 
test_chunked(): 2 | from glue_shared.helpers import chunked 3 | 4 | l1 = [x for x in range(10)] 5 | l2 = [x for x in range(12)] 6 | 7 | assert tuple(chunked(l1, 3)) == ([0, 1, 2], [3, 4, 5], [6, 7, 8], [9]) 8 | assert tuple(chunked(l1, 5)) == ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]) 9 | assert tuple(chunked(l1, 10)) == ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],) 10 | assert tuple(chunked(l2, 3)) == ([0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]) 11 | assert tuple(chunked(l2, 10)) == ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11]) 12 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | glue: 5 | container_name: glue 6 | build: 7 | dockerfile: "./Dockerfile" 8 | context: "./" 9 | command: "tail -f /dev/null" 10 | working_dir: "/project" 11 | volumes: 12 | - "./:/project" 13 | environment: 14 | TF_VAR_glue_bucket_name: ${TF_VAR_glue_bucket_name} 15 | TF_STATE_BUCKET: ${TF_STATE_BUCKET} 16 | AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} 17 | AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} 18 | AWS_REGION: ${AWS_REGION} 19 | AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION} 20 | TERRAFORM_VERSION: 0.14.3 21 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "iam_role_name" { 6 | type = string 7 | description = "The name of the role. If omitted, Terraform will assign a random, unique name." 8 | } 9 | 10 | variable "assume_role_policy" { 11 | type = string 12 | description = "The policy that grants an entity permission to assume the role." 
13 | } 14 | 15 | variable "tags" { 16 | type = map(any) 17 | description = "Key-value mapping of tags for the IAM role" 18 | } 19 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_glue_job" "job" { 2 | name = var.name 3 | connections = var.connections 4 | 5 | number_of_workers = var.number_of_workers 6 | worker_type = var.worker_type 7 | 8 | max_retries = var.max_retries 9 | 10 | glue_version = "2.0" 11 | 12 | command { 13 | name = "glueetl" 14 | script_location = var.script_location 15 | } 16 | 17 | default_arguments = var.default_arguments 18 | description = var.description 19 | execution_property { 20 | max_concurrent_runs = var.max_concurrent_runs 21 | } 22 | 23 | role_arn = var.role_arn 24 | tags = var.tags 25 | } 26 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_glue_job" "job" { 2 | name = var.name 3 | connections = var.connections 4 | 5 | max_capacity = var.max_capacity 6 | 7 | max_retries = var.max_retries 8 | 9 | glue_version = "1.0" 10 | 11 | command { 12 | name = "pythonshell" 13 | script_location = var.script_location 14 | python_version = var.python_version 15 | } 16 | 17 | default_arguments = var.default_arguments 18 | description = var.description 19 | execution_property { 20 | max_concurrent_runs = var.max_concurrent_runs 21 | } 22 | 23 | role_arn = var.role_arn 24 | tags = var.tags 25 | } 26 | -------------------------------------------------------------------------------- /terraform/modules/glue-workflow/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | security\_name | Name of the security configuration. | `string` | n/a | yes | 19 | | workflow\_name | The name you assign to this workflow. 
| `string` | n/a | yes | 20 | 21 | ## Outputs 22 | 23 | | Name | Description | 24 | |------|-------------| 25 | | workflow\_name | AWS Glue Workflow Name | 26 | 27 | 28 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_glue_functions.py: -------------------------------------------------------------------------------- 1 | def test_get_glue_args(): 2 | from types import ModuleType 3 | import sys 4 | 5 | sys_argv = ["--APP_SETTINGS_ENVIRONMENT", "dev", "--JOB_NAME", "job"] 6 | expected = {"APP_SETTINGS_ENVIRONMENT": "dev", "JOB_NAME": "job"} 7 | 8 | def getResolvedOptions(args, options): 9 | """Fake version of awsglue.utils.getResolvedOptions.""" 10 | return {"APP_SETTINGS_ENVIRONMENT": "dev", "JOB_NAME": "job"} 11 | 12 | sys.modules["awsglue.utils"] = ModuleType("awsglue.utils") 13 | sys.modules["awsglue.utils"].getResolvedOptions = getResolvedOptions 14 | 15 | from glue_shared import get_glue_args 16 | 17 | assert get_glue_args(sys_argv, ["APP_SETTINGS_ENVIRONMENT"]) == expected 18 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/str2obj.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | from typing import Tuple 4 | 5 | import dateutil.parser 6 | 7 | LOGGER = logging.getLogger(__name__) 8 | 9 | 10 | def str2bool(value): 11 | return value.lower() == "true" 12 | 13 | 14 | def comma_str_time_2_time_obj(comma_str: str) -> Tuple[datetime.datetime, ...]: 15 | """ 16 | Convert comma separated time strings into a tuple of datetime objects. 17 | 18 | Parameters 19 | ---------- 20 | comma_str 21 | Comma separated times: 2020-04-20 16:00:00, 2020-04-20 15:00:00 22 | 23 | Returns 24 | ------- 25 | A tuple of datetime objects. 26 | 27 | """ 28 | 29 | return tuple(dateutil.parser.parse(time_str) for time_str in comma_str.split(",")) 30 | -------------------------------------------------------------------------------- /terraform/solution/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | | Name | Version | 7 | |------|---------| 8 | | terraform | 0.14.3 | 9 | | aws | ~> 3.22.0 | 10 | 11 | ## Providers 12 | 13 | | Name | Version | 14 | |------|---------| 15 | | aws | ~> 3.22.0 | 16 | 17 | ## Inputs 18 | 19 | | Name | Description | Type | Default | Required | 20 | |------|-------------|------|---------|:--------:| 21 | | glue\_bucket\_name | S3 bucket name where glue jobs are stored | `string` | n/a | yes | 22 | | region | AWS region name | `string` | `"us-east-1"` | no | 23 | | tags | AWS resource tags | `map(string)` | `{}` | no | 24 | 25 | ## Outputs 26 | 27 | No output. 28 | 29 | 30 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements.
7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | bucket\_name | The name of the bucket | `string` | n/a | yes | 19 | | tags | Tags associated with the bucket | `map(any)` | n/a | yes | 20 | | versioning\_enabled | Enable versioning | `string` | n/a | yes | 21 | 22 | ## Outputs 23 | 24 | | Name | Description | 25 | |------|-------------| 26 | | arn | S3 bucket ARN | 27 | | id | S3 bucket ID | 28 | 29 | 30 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Config file for the ds1 refined-to-curated Glue job. 3 | """ 4 | import logging.config 5 | import sys 6 | 7 | from glue_shared import parse_args 8 | from glue_shared.defaults import default_logging_config 9 | 10 | region = "us-east-1" 11 | arguments = parse_args(sys.argv, ["APP_SETTINGS_ENVIRONMENT", "LOG_LEVEL", "S3_BUCKET"]) 12 | 13 | LOGGING_CONFIG = default_logging_config(arguments["LOG_LEVEL"]) 14 | logging.config.dictConfig(LOGGING_CONFIG) 15 | 16 | JOB_CONFIG = dict(arguments) 17 | # must be hard-coded because glue does not provide this in PyShell jobs 18 | JOB_CONFIG["JOB_NAME"] = JOB_CONFIG.get("JOB_NAME") or "ds1-refined-to-curated" 19 | JOB_CONFIG["JOB_ID"] = JOB_CONFIG.get("JOB_ID") 20 | JOB_CONFIG["JOB_RUN_ID"] = JOB_CONFIG.get("JOB_RUN_ID") 21 | 22 | JOB_CONFIG["s3_prefix"] = "ds1/refined" 23 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="glue_shared", 5 | author="Jan Gazda", 6 | author_email="jan.gazda@cloudreach.com", 7 | python_requires=">=3.6", 8 | classifiers=[ 9 | "Development Status :: 2 - Pre-Alpha", 10 | "Intended Audience :: Developers", 11 | "Natural Language :: English", 12 | "Programming Language :: Python :: 3.6", 13 | "Programming Language :: Python :: 3.7", 14 | "Programming Language :: Python :: 3.8", 15 | ], 16 | description="Helper library for AWS Glue jobs.", 17 | setup_requires=["wheel"], 18 | package_dir={"": "src"}, 19 | packages=find_packages(where="src", exclude=["contrib", "docs", "tests"]), 20 | test_suite="tests", 21 | version="0.0.1", 22 | zip_safe=True, 23 | ) 24 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | assume\_role\_policy | The policy that grants an entity permission to assume the role. | `string` | n/a | yes | 19 | | iam\_role\_name | The name of the role. If omitted, Terraform will assign a random, unique name.
| `string` | n/a | yes | 20 | | tags | Key-value mapping of tags for the IAM role | `map(any)` | n/a | yes | 21 | 22 | ## Outputs 23 | 24 | | Name | Description | 25 | |------|-------------| 26 | | iam\_role\_arn | IAM role ARN | 27 | | iam\_role\_id | IAM role id | 28 | 29 | 30 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/__init__.py: -------------------------------------------------------------------------------- 1 | """Collection of convenient functions shared among the glue jobs.""" 2 | 3 | import logging 4 | 5 | from glue_shared.argument_handlers import parse_args 6 | from glue_shared.boto3_helpers import ( 7 | resolve_ssm_parameters, 8 | get_connection, 9 | gracefully_exit, 10 | ) 11 | from glue_shared.glue_interface import ( 12 | get_glue_args, 13 | get_spark_session_and_glue_job, 14 | commit_job, 15 | ) 16 | from glue_shared.str2obj import str2bool, comma_str_time_2_time_obj 17 | 18 | LOGGER = logging.getLogger("glue_shared") 19 | LOGGER.addHandler(logging.NullHandler()) 20 | 21 | __all__ = [ 22 | "parse_args", 23 | "resolve_ssm_parameters", 24 | "get_connection", 25 | "gracefully_exit", 26 | "get_glue_args", 27 | "get_spark_session_and_glue_job", 28 | "commit_job", 29 | "str2bool", 30 | "comma_str_time_2_time_obj", 31 | ] 32 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/spark_helpers.py: -------------------------------------------------------------------------------- 1 | """Useful functions to simplify Spark functionality.""" 2 | import logging 3 | 4 | LOGGER = logging.getLogger(__name__) 5 | 6 | 7 | def read_parquet(spark, path): 8 | """Read the data from a raw zone bucket.""" 9 | LOGGER.info("Reading parquet data from %s", path) 10 | df = spark.read.parquet(path) 11 | LOGGER.debug("DF: %s", show_spark_df(df, 10)) 12 | return df 13 | 14 | 15 | def show_spark_df(df, n=20, truncate=True, vertical=False): 16 | """ 17 | Show DataFrame as str, useful for logging. 18 | 19 | Notes 20 | ----- 21 | Reimplemented from: 22 | https://spark.apache.org/docs/2.4.5/api/python/_modules/pyspark/sql/dataframe.html#DataFrame.show 23 | """ 24 | if isinstance(truncate, bool) and truncate: 25 | return df._jdf.showString(n, 20, vertical) 26 | else: 27 | return df._jdf.showString(n, int(truncate), vertical) 28 | -------------------------------------------------------------------------------- /terraform/modules/iam-policy/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "iam_role_policy_name" { 6 | type = string 7 | description = "The name of the policy. If omitted, Terraform will assign a random, unique name." 8 | } 9 | 10 | variable "iam_role_policy" { 11 | type = string 12 | description = "The policy document. This is a JSON formatted string. For more information about building AWS IAM policy documents with Terraform, see the AWS IAM Policy Document Guide." 13 | } 14 | 15 | variable "attachment_name" { 16 | type = string 17 | description = "The name of the attachment." 18 | } 19 | 20 | variable "roles" { 21 | type = list(any) 22 | description = "The role(s) the policy should be applied to."
23 | } -------------------------------------------------------------------------------- /terraform/solution/glue_workflow_simple.tf: -------------------------------------------------------------------------------- 1 | module "glue_workflow_simple" { 2 | source = "../modules/glue-workflow" 3 | workflow_name = "etl-workflow-simple" 4 | security_name = "glueSecurityConfigSimple" 5 | } 6 | 7 | ###################### Glue Triggers and DAG ######################################## 8 | 9 | resource "aws_glue_trigger" "start_raw_to_refined" { 10 | name = "start_raw_to_refined" 11 | type = "ON_DEMAND" 12 | workflow_name = module.glue_workflow_simple.workflow_name 13 | actions { 14 | job_name = module.ds1_raw_to_refined_job.job_name 15 | } 16 | } 17 | 18 | resource "aws_glue_trigger" "run_refined_to_curated" { 19 | name = "run_refined_to_curated" 20 | type = "CONDITIONAL" 21 | workflow_name = module.glue_workflow_simple.workflow_name 22 | actions { 23 | job_name = module.ds1_refined_to_curated_job.job_name 24 | } 25 | 26 | predicate { 27 | conditions { 28 | job_name = module.ds1_raw_to_refined_job.job_name 29 | state = "SUCCEEDED" 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration file for ds1-raw-to-refined Python Shell Glue job. 3 | """ 4 | 5 | import logging.config 6 | import sys 7 | 8 | from glue_shared import parse_args 9 | from glue_shared.defaults import default_logging_config 10 | 11 | arguments = parse_args(sys.argv, ["APP_SETTINGS_ENVIRONMENT", "LOG_LEVEL", "S3_BUCKET"]) 12 | 13 | LOGGING_CONFIG = default_logging_config(arguments["LOG_LEVEL"]) 14 | logging.config.dictConfig(LOGGING_CONFIG) 15 | 16 | JOB_CONFIG = dict(arguments) 17 | # must be hard-coded because glue does not provide this in PyShell jobs 18 | JOB_CONFIG["JOB_NAME"] = JOB_CONFIG.get("JOB_NAME") or "ds1-raw-to-refined" 19 | JOB_CONFIG["JOB_ID"] = JOB_CONFIG.get("JOB_ID") 20 | JOB_CONFIG["JOB_RUN_ID"] = JOB_CONFIG.get("JOB_RUN_ID") 21 | 22 | JOB_CONFIG["WORKFLOW_NAME"] = JOB_CONFIG.get("WORKFLOW_NAME") 23 | JOB_CONFIG["WORKFLOW_RUN_ID"] = JOB_CONFIG.get("WORKFLOW_RUN_ID") 24 | 25 | # raw data 26 | 27 | JOB_CONFIG["s3_raw_prefix"] = "ds1/raw" 28 | JOB_CONFIG["s3_refined_prefix"] = "ds1/refined" 29 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/raw_to_refined.py: -------------------------------------------------------------------------------- 1 | """ 2 | AWS GLUE PyShell Job to process RAW data. 3 | From Raw zone to Refined zone.
4 | """ 5 | import logging 6 | import pandas as pd 7 | from glue_shared.pandas_helpers import write_parquet 8 | 9 | LOGGER = logging.getLogger("job") 10 | 11 | 12 | def main(): 13 | LOGGER.info("JOB_NAME: %s", JOB_CONFIG["JOB_NAME"]) 14 | LOGGER.info("JOB_ID: %s", JOB_CONFIG["JOB_ID"]) 15 | LOGGER.info("JOB_RUN_ID %s", JOB_CONFIG["JOB_RUN_ID"]) 16 | 17 | LOGGER.info("WORKFLOW_NAME: %s", JOB_CONFIG["WORKFLOW_NAME"]) 18 | LOGGER.info("WORKFLOW_RUN_ID %s", JOB_CONFIG["WORKFLOW_RUN_ID"]) 19 | data_src = f"s3://{JOB_CONFIG['S3_BUCKET']}/{JOB_CONFIG['s3_raw_prefix']}/cereal.csv" 20 | LOGGER.info("Reading raw data from %s", data_src) 21 | df = pd.read_csv(data_src, sep=";") 22 | LOGGER.info("DF shape %s", df.shape) 23 | write_parquet(df, f"s3://{JOB_CONFIG['S3_BUCKET']}/{JOB_CONFIG['s3_refined_prefix']}") 24 | 25 | 26 | if __name__ == "__main__": 27 | from config import JOB_CONFIG 28 | 29 | main() 30 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/refined_to_curated.py: -------------------------------------------------------------------------------- 1 | """ 2 | AWS GLUE PySpark Job to process REFINED data. 3 | From Refined zone to Curated zone. 4 | """ 5 | import datetime 6 | import logging 7 | 8 | import pyspark 9 | from glue_shared import get_spark_session_and_glue_job 10 | from glue_shared.spark_helpers import read_parquet 11 | 12 | LOGGER = logging.getLogger("job") 13 | 14 | 15 | def run_etl(cfg, spark: pyspark.sql.SQLContext): 16 | df = read_parquet(spark, f"s3://{cfg['S3_BUCKET']}/{cfg['s3_prefix']}") 17 | LOGGER.debug("Count in: %s", df.count()) 18 | LOGGER.debug("Here we can continue processing data and write them to the curated zone.") 19 | 20 | 21 | def main(): 22 | spark, job = get_spark_session_and_glue_job(JOB_CONFIG) 23 | LOGGER.debug("Spark job started at: %s", datetime.datetime.utcnow().isoformat()) 24 | 25 | run_etl(JOB_CONFIG, spark) 26 | 27 | LOGGER.debug("Spark job finished at: %s", datetime.datetime.utcnow().isoformat()) 28 | 29 | 30 | if __name__ == "__main__": 31 | from config import JOB_CONFIG 32 | 33 | main() 34 | -------------------------------------------------------------------------------- /terraform/solution/s3.tf: -------------------------------------------------------------------------------- 1 | ##################### S3 Buckets ####################### 2 | module "s3_bucket_all" { 3 | source = "../modules/s3-bucket" 4 | bucket_name = var.glue_bucket_name 5 | versioning_enabled = false 6 | tags = var.tags 7 | 8 | } 9 | 10 | resource "aws_s3_bucket_object" "ds1_raw_folder" { 11 | key = "/ds1/raw/" 12 | bucket = module.s3_bucket_all.id 13 | server_side_encryption = "AES256" 14 | } 15 | 16 | resource "aws_s3_bucket_object" "raw_data_file" { 17 | bucket = module.s3_bucket_all.id 18 | key = "/ds1/raw/cereal.csv" 19 | source = "../../dummy_data/cereal.csv" 20 | } 21 | 22 | resource "aws_s3_bucket_object" "ds1_refined_folder" { 23 | key = "/ds1/refined/yes" 24 | bucket = module.s3_bucket_all.id 25 | server_side_encryption = "AES256" 26 | } 27 | 28 | resource "aws_s3_bucket_object" "code_folder" { 29 | key = "/code/" 30 | bucket = module.s3_bucket_all.id 31 | server_side_encryption = "AES256" 32 | } 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | TF_DIR=$(BASEDIR)/terraform/solution 3 | 4 | # Check if deploy environment is set! 
5 | variables := TF_STATE_BUCKET 6 | 7 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 8 | $(foreach 1,$(variables),$(fatal_if_undefined)) 9 | 10 | 11 | .PHONY: help 12 | help: ## This help. 13 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 14 | 15 | tf-apply: ## terraform apply 16 | cd $(TF_DIR) && terraform apply -auto-approve 17 | 18 | tf-init: ## terraform init 19 | cd $(TF_DIR) && terraform init -backend-config "bucket=${TF_STATE_BUCKET}" -backend-config "key=tf.state" 20 | 21 | tf-plan: ## terraform plan 22 | cd $(TF_DIR) && terraform plan 23 | 24 | tf-destroy: ## terraform destroy 25 | cd $(TF_DIR) && terraform destroy -force 26 | 27 | jobs-deploy: ## deploy glue jobs 28 | bash glue-jobs.sh deploy 29 | 30 | jobs-package: ## package glue jobs 31 | bash glue-jobs.sh package 32 | 33 | jobs-clean: ## clean glue jobs 34 | bash glue-jobs.sh clean 35 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help 2 | help: ## This help. 3 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 6 | 7 | clean-build: ## Remove build artifacts 8 | rm -fr build/ 9 | rm -fr dist/ 10 | rm -fr .eggs/ 11 | find . -name '*.egg-info' -exec rm -fr {} + 12 | find . -name '*.egg' -exec rm -f {} + 13 | 14 | clean-pyc: ## Remove Python file artifacts 15 | find . -name '*.pyc' -exec rm -f {} + 16 | find . -name '*.pyo' -exec rm -f {} + 17 | find . -name '*~' -exec rm -f {} + 18 | find . -name '__pycache__' -exec rm -fr {} + 19 | 20 | clean-test: ## Remove test and coverage artifacts 21 | rm -fr .tox/ 22 | rm -f .coverage 23 | rm -fr htmlcov/ 24 | rm -fr .pytest_cache 25 | 26 | install: clean ## Install in a current environment 27 | python setup.py install 28 | 29 | install-dev: ## Install in development mode 30 | python setup.py develop 31 | 32 | wheel: clean ## Build wheel 33 | python setup.py bdist_wheel 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jan Gazda 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /terraform/modules/iam-policy/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | attachment\_name | The name of the attachment. | `string` | n/a | yes | 19 | | iam\_role\_policy | The policy document. This is a JSON formatted string. For more information about building AWS IAM policy documents with Terraform, see the AWS IAM Policy Document Guide. | `string` | n/a | yes | 20 | | iam\_role\_policy\_name | The name of the policy. If omitted, Terraform will assign a random, unique name. | `string` | n/a | yes | 21 | | roles | The role(s) the policy should be applied to. | `list(any)` | n/a | yes | 22 | 23 | ## Outputs 24 | 25 | No output. 26 | 27 | 28 | -------------------------------------------------------------------------------- /terraform/solution/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "3.22.0" 6 | constraints = "~> 3.22.0" 7 | hashes = [ 8 | "h1:8aWXjFcmEi64P0TMHOCQXWws+/SmvJQrNvHlzdktKOM=", 9 | "h1:f/Tz8zv1Zb78ZaiyJkQ0MGIViZwbYrLuQk3kojPM91c=", 10 | "zh:4a9a66caf1964cdd3b61fb3ebb0da417195a5529cb8e496f266b0778335d11c8", 11 | "zh:514f2f006ae68db715d86781673faf9483292deab235c7402ff306e0e92ea11a", 12 | "zh:5277b61109fddb9011728f6650ef01a639a0590aeffe34ed7de7ba10d0c31803", 13 | "zh:67784dc8c8375ab37103eea1258c3334ee92be6de033c2b37e3a2a65d0005142", 14 | "zh:76d4c8be2ca4a3294fb51fb58de1fe03361d3bc403820270cc8e71a04c5fa806", 15 | "zh:8f90b1cfdcf6e8fb1a9d0382ecaa5056a3a84c94e313fbf9e92c89de271cdede", 16 | "zh:d0ac346519d0df124df89be2d803eb53f373434890f6ee3fb37112802f9eac59", 17 | "zh:d6256feedada82cbfb3b1dd6dd9ad02048f23120ab50e6146a541cb11a108cc1", 18 | "zh:db2fe0d2e77c02e9a74e1ed694aa352295a50283f9a1cf896e5be252af14e9f4", 19 | "zh:eda61e889b579bd90046939a5b40cf5dc9031fb5a819fc3e4667a78bd432bdb2", 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /glue-jobs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | # This script orchestrates the packaging and uploading of all glue jobs. 4 | # The idea is to run different Make targets to get the desired effect.
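# Usage: bash glue-jobs.sh [package|test|deploy|clean]
# (the root Makefile wraps this as jobs-package, jobs-deploy and jobs-clean)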
5 | # Each Glue Job should comply with the structure and provide the 6 | # necessary Make targets: 7 | # package - create a deployable package 8 | # test - run tests 9 | # deploy - make the package and upload it to S3 10 | # In the end, the S3 content will be glue_job.py and glue_job_deps.zip. 11 | 12 | ARGUMENT=$1 13 | OPTIONS="package|test|deploy|clean" 14 | 15 | if [[ -z "$ARGUMENT" || ${OPTIONS} != *"$ARGUMENT"* ]]; then 16 | echo "Argument must match one of ${OPTIONS}" 17 | exit 1 18 | fi 19 | 20 | echo GLUEING GLUE 21 | # Deploy data source specific jobs 22 | for DIR in glue/data_sources/*/*; do 23 | if [[ -d ${DIR} ]]; then 24 | if [[ -e ${DIR}/Makefile ]]; then 25 | cd ${DIR} 26 | echo --------${DIR}------------- 27 | make ${ARGUMENT} 28 | echo --------------------------- 29 | cd - 30 | fi 31 | fi; 32 | done 33 | 34 | # Deploy general jobs 35 | for DIR in glue/shared/glue_jobs/*; do 36 | if [[ -d ${DIR} ]]; then 37 | if [[ -e ${DIR}/Makefile ]]; then 38 | cd ${DIR} 39 | echo --------${DIR}------------- 40 | make ${ARGUMENT} 41 | echo --------------------------- 42 | cd - 43 | fi 44 | fi; 45 | done 46 | -------------------------------------------------------------------------------- /terraform/solution/iam.tf: -------------------------------------------------------------------------------- 1 | ###################### Glue IAM ######################################## 2 | 3 | module "glue_role" { 4 | source = "../modules/iam-role" 5 | iam_role_name = "glue-role" 6 | assume_role_policy = data.aws_iam_policy_document.glue_assume_role_policy.json 7 | tags = var.tags 8 | } 9 | 10 | data "aws_iam_policy_document" "glue_assume_role_policy" { 11 | statement { 12 | actions = ["sts:AssumeRole"] 13 | 14 | principals { 15 | type = "Service" 16 | identifiers = ["glue.amazonaws.com"] 17 | } 18 | } 19 | } 20 | 21 | ################# Attach AWS Managed Policies ################## 22 | 23 | resource "aws_iam_policy_attachment" "glue_service_role" { 24 | name = "AWSGlueServiceRole" 25 | roles = [module.glue_role.iam_role_id] 26 | policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole" 27 | } 28 | 29 | resource "aws_iam_policy_attachment" "s3_full_access" { 30 | name = "AmazonS3FullAccess" 31 | roles = [module.glue_role.iam_role_id] 32 | policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess" 33 | } 34 | 35 | resource "aws_iam_policy_attachment" "cloudwatch_logs_role" { 36 | name = "CloudWatchLogsFullAccess" 37 | roles = [module.glue_role.iam_role_id] 38 | policy_arn = "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess" 39 | } 40 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/README.md: -------------------------------------------------------------------------------- 1 | # Glue PySpark job 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | connections | The list of connections used for this job. | `list(string)` | `[]` | no | 19 | | default\_arguments | The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide.
For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide. | `map(string)` | `{}` | no | 20 | | description | Description of the job. | `string` | `""` | no | 21 | | max\_concurrent\_runs | The maximum number of concurrent runs allowed for a job. The default is 1. | `string` | `"1"` | no | 22 | | max\_retries | Number of retries | `string` | `null` | no | 23 | | name | Name of the job | `string` | n/a | yes | 24 | | number\_of\_workers | Number of Glue (G.#X) workers | `number` | `null` | no | 25 | | role\_arn | The ARN of the IAM role associated with this job. | `string` | n/a | yes | 26 | | script\_location | Specifies the S3 path to a script that executes a job. | `string` | n/a | yes | 27 | | tags | AWS resource tags | `map(string)` | `{}` | no | 28 | | worker\_type | Worker type | `string` | `"G.1X"` | no | 29 | 30 | ## Outputs 31 | 32 | | Name | Description | 33 | |------|-------------| 34 | | job\_arn | AWS Glue Job ARN | 35 | | job\_name | AWS Glue Job Name | 36 | 37 | 38 | -------------------------------------------------------------------------------- /terraform/solution/glue_workflow_complex.tf: -------------------------------------------------------------------------------- 1 | module "glue_workflow_complex" { 2 | source = "../modules/glue-workflow" 3 | workflow_name = "etl-workflow-complex" 4 | security_name = "glueSecurityConfigComplex" 5 | } 6 | 7 | ###################### Glue Triggers and DAG ######################################## 8 | 9 | locals { 10 | jobs_0_1 = [ 11 | module.dummy_job["dummy_job_0"].job_name, 12 | module.dummy_job["dummy_job_1"].job_name 13 | ] 14 | job_2 = module.dummy_job["dummy_job_2"].job_name 15 | jobs_3_5 = [ 16 | module.dummy_job["dummy_job_3"].job_name, 17 | module.dummy_job["dummy_job_4"].job_name, 18 | module.dummy_job["dummy_job_5"].job_name 19 | ] 20 | } 21 | 22 | # starts 2 jobs 23 | resource "aws_glue_trigger" "start_complex" { 24 | name = "start_complex" 25 | type = "ON_DEMAND" 26 | workflow_name = module.glue_workflow_complex.workflow_name 27 | 28 | dynamic "actions" { 29 | for_each = local.jobs_0_1 30 | content { 31 | job_name = actions.value 32 | } 33 | } 34 | } 35 | 36 | # waits for first 2 jobs to finish 37 | resource "aws_glue_trigger" "complex_stage_2" { 38 | name = "complex_stage_2" 39 | type = "CONDITIONAL" 40 | workflow_name = module.glue_workflow_complex.workflow_name 41 | 42 | actions { 43 | job_name = local.job_2 44 | } 45 | predicate { 46 | dynamic "conditions" { 47 | for_each = local.jobs_0_1 48 | content { 49 | job_name = conditions.value 50 | state = "SUCCEEDED" 51 | } 52 | } 53 | } 54 | } 55 | 56 | resource "aws_glue_trigger" "wait_for_second" { 57 | name = "complex_stage_3" 58 | type = "CONDITIONAL" 59 | workflow_name = module.glue_workflow_complex.workflow_name 60 | 61 | dynamic "actions" { 62 | for_each = local.jobs_3_5 63 | content { 64 | job_name = actions.value 65 | } 66 | } 67 | 68 | predicate { 69 | conditions { 70 | job_name = local.job_2 71 | state = "SUCCEEDED" 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/helpers.py: -------------------------------------------------------------------------------- 1 | """Helper functions used across this library.""" 2 | import os 3 | import re 4 | from functools import partial 5 | from itertools import islice 6 | from typing import Tuple 7 | 8 | EXTENSIONS = 
re.compile(r".+py$|.+zip$|.+egg$") 9 | 10 | 11 | def take(n, iterable): 12 | """ 13 | Return first n items of the iterable as a list 14 | 15 | Notes 16 | ----- 17 | From itertools recipes: 18 | https://docs.python.org/3.6/library/itertools.html#itertools-recipes 19 | """ 20 | 21 | return list(islice(iterable, n)) 22 | 23 | 24 | def chunked(iterable, n): 25 | """Break *iterable* into lists of length *n*: 26 | 27 | >>> list(chunked([1, 2, 3, 4, 5, 6], 3)) 28 | [[1, 2, 3], [4, 5, 6]] 29 | 30 | If the length of *iterable* is not evenly divisible by *n*, the last 31 | returned list will be shorter: 32 | 33 | >>> list(chunked([1, 2, 3, 4, 5, 6, 7, 8], 3)) 34 | [[1, 2, 3], [4, 5, 6], [7, 8]] 35 | 36 | To use a fill-in value instead, see the :func:`grouper` recipe. 37 | 38 | :func:`chunked` is useful for splitting up a computation on a large number 39 | of keys into batches, to be pickled and sent off to worker processes. One 40 | example is operations on rows in MySQL, which does not implement 41 | server-side cursors properly and would otherwise load the entire dataset 42 | into RAM on the client. 43 | 44 | Notes 45 | ----- 46 | Reimplemented from more itertools to avoid the installation of the package. 47 | https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.chunked 48 | """ 49 | return iter(partial(take, n, iter(iterable)), []) 50 | 51 | 52 | def get_py_zip_egg_files(path: str) -> Tuple[str, ...]: 53 | """ 54 | Find all .py, .zip, .egg files in sys.path. 55 | 56 | This method is a workaround needed for Glue2.0 as of 2020-05-11 57 | """ 58 | 59 | return tuple(e.path for e in filter(lambda ent: EXTENSIONS.match(ent.name), os.scandir(path))) 60 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/README.md: -------------------------------------------------------------------------------- 1 | # Glue Python Shell job 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | connections | The list of connections used for this job. | `list(string)` | `[]` | no | 19 | | default\_arguments | The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide. For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide. | `map(string)` | `{}` | no | 20 | | description | Description of the job. | `string` | `""` | no | 21 | | max\_capacity | The maximum number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. Required when pythonshell is set, accept either 0.0625 or 1.0. | `string` | `"0.0625"` | no | 22 | | max\_concurrent\_runs | The maximum number of concurrent runs allowed for a job. The default is 1. | `string` | `"1"` | no | 23 | | max\_retries | Number of retries | `string` | `null` | no | 24 | | name | Name of the job | `string` | n/a | yes | 25 | | python\_version | The Python version being used to execute a Python shell job. Allowed values are 2 or 3. 
| `string` | `"3"` | no | 26 | | role\_arn | The ARN of the IAM role associated with this job. | `string` | n/a | yes | 27 | | script\_location | Specifies the S3 path to a script that executes a job. | `string` | n/a | yes | 28 | | tags | AWS resource tags | `map(string)` | `{}` | no | 29 | 30 | ## Outputs 31 | 32 | | Name | Description | 33 | |------|-------------| 34 | | job\_arn | AWS Glue Job ARN | 35 | | job\_name | AWS Glue Job Name | 36 | 37 | 38 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | MK_PATH:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) 3 | MK_PARENT:=$(realpath $(MK_PATH)../) 4 | JOB_NAME:=$(notdir $(MK_PARENT)) 5 | TRANSITION_STATE:=$(notdir $(BASEDIR)) 6 | TRANSITION_STATE_PY=$(TRANSITION_STATE).py 7 | JOB_TRANSITION_ZIP=$(BASEDIR)/$(TRANSITION_STATE).zip 8 | BUILD_DIR=$(BASEDIR)/dist 9 | 10 | variables := TF_VAR_glue_bucket_name 11 | 12 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 13 | $(foreach 1,$(variables),$(fatal_if_undefined)) 14 | 15 | .PHONY: help 16 | help: ## This help. 17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 18 | 19 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 20 | 21 | clean-build: ## Remove build artifacts 22 | rm -fr build/ 23 | rm -fr dist/ 24 | rm -fr .eggs/ 25 | find . -name '*.egg-info' -exec rm -fr {} + 26 | find . -name '*.egg' -exec rm -f {} + 27 | 28 | clean-pyc: ## Remove Python file artifacts 29 | find . -name '*.pyc' -exec rm -f {} + 30 | find . -name '*.pyo' -exec rm -f {} + 31 | find . -name '*~' -exec rm -f {} + 32 | find . 
-name '__pycache__' -exec rm -fr {} + 33 | 34 | clean-test: ## Remove test and coverage artifacts 35 | rm -fr .tox/ 36 | rm -f .coverage 37 | rm -fr htmlcov/ 38 | rm -fr .pytest_cache 39 | 40 | test: ## Run tests 41 | echo Tests are not implemented 42 | 43 | package: ## Build deps package 44 | @echo Packaging 45 | @mkdir -p $(BUILD_DIR) 46 | @pip install wheel 47 | cp config.py $(BUILD_DIR) 48 | pip wheel -w $(BUILD_DIR) -r requirements.txt --no-deps 49 | 50 | upload-job: ## Upload job.py file 51 | @echo Uploading $(JOB_NAME) 52 | aws s3 cp $(TRANSITION_STATE_PY) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/$(TRANSITION_STATE_PY) 53 | 54 | upload: upload-job ## Upload artefacts to S3 55 | aws s3 sync --delete $(BUILD_DIR) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/dependencies 56 | 57 | deploy: clean package upload ## Package and upload to S3 58 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | MK_PATH:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) 3 | MK_PARENT:=$(realpath $(MK_PATH)../) 4 | JOB_NAME:=$(notdir $(MK_PARENT)) 5 | TRANSITION_STATE:=$(notdir $(BASEDIR)) 6 | TRANSITION_STATE_PY=$(TRANSITION_STATE).py 7 | BUILD_DIR=$(BASEDIR)/dist 8 | JOB_TRANSITION_ZIP=$(BUILD_DIR)/$(TRANSITION_STATE).zip 9 | 10 | variables := TF_VAR_glue_bucket_name 11 | 12 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 13 | $(foreach 1,$(variables),$(fatal_if_undefined)) 14 | 15 | .PHONY: help 16 | help: ## This help. 17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 18 | 19 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 20 | 21 | clean-build: ## Remove build artifacts 22 | rm -fr build/ 23 | rm -fr dist/ 24 | rm -fr .eggs/ 25 | find . -name '*.egg-info' -exec rm -fr {} + 26 | find . -name '*.egg' -exec rm -f {} + 27 | 28 | clean-pyc: ## Remove Python file artifacts 29 | find . -name '*.pyc' -exec rm -f {} + 30 | find . -name '*.pyo' -exec rm -f {} + 31 | find . -name '*~' -exec rm -f {} + 32 | find . -name '__pycache__' -exec rm -fr {} + 33 | 34 | clean-test: ## Remove test and coverage artifacts 35 | rm -fr .tox/ 36 | rm -f .coverage 37 | rm -fr htmlcov/ 38 | rm -fr .pytest_cache 39 | 40 | test: ## Run tests 41 | echo Tests are not implemented 42 | 43 | package: ## Build deps package 44 | mkdir -p $(BUILD_DIR) 45 | cp config.py $(BUILD_DIR) 46 | pip install -t $(BUILD_DIR) -r requirements.txt 47 | cd $(BUILD_DIR) && zip -r $(JOB_TRANSITION_ZIP) . 
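# Note: this PySpark job ships its dependencies as a single zip which the
# Terraform job definition passes via --extra-py-files, whereas the Python
# Shell job (../raw_to_refined) syncs wheel files to S3 instead.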
48 | 49 | 50 | upload-job: ## Upload job.py file 51 | @echo Uploading $(JOB_NAME) 52 | aws s3 cp $(TRANSITION_STATE_PY) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/$(TRANSITION_STATE_PY) 53 | 54 | upload: upload-job ## Upload artefacts to S3 55 | aws s3 cp $(JOB_TRANSITION_ZIP) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/dependencies/ 56 | 57 | deploy: clean package upload ## Package and upload to S3 58 | -------------------------------------------------------------------------------- /glue/data_sources/dummy_job/dummy_transition/Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | MK_PATH:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) 3 | MK_PARENT:=$(realpath $(MK_PATH)../) 4 | JOB_NAME:=$(notdir $(MK_PARENT)) 5 | TRANSITION_STATE:=$(notdir $(BASEDIR)) 6 | TRANSITION_STATE_PY=$(TRANSITION_STATE).py 7 | JOB_TRANSITION_ZIP=$(BASEDIR)/$(TRANSITION_STATE).zip 8 | BUILD_DIR=$(BASEDIR)/dist 9 | 10 | variables := TF_VAR_glue_bucket_name 11 | 12 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 13 | $(foreach 1,$(variables),$(fatal_if_undefined)) 14 | 15 | .PHONY: help 16 | help: ## This help. 17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 18 | 19 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 20 | 21 | clean-build: ## Remove build artifacts 22 | rm -fr build/ 23 | rm -fr dist/ 24 | rm -fr .eggs/ 25 | find . -name '*.egg-info' -exec rm -fr {} + 26 | find . -name '*.egg' -exec rm -f {} + 27 | 28 | clean-pyc: ## Remove Python file artifacts 29 | find . -name '*.pyc' -exec rm -f {} + 30 | find . -name '*.pyo' -exec rm -f {} + 31 | find . -name '*~' -exec rm -f {} + 32 | find . -name '__pycache__' -exec rm -fr {} + 33 | 34 | clean-test: ## Remove test and coverage artifacts 35 | rm -fr .tox/ 36 | rm -f .coverage 37 | rm -fr htmlcov/ 38 | rm -fr .pytest_cache 39 | 40 | test: ## Run tests 41 | echo Tests are not implemented 42 | 43 | package: ## Build deps package 44 | @echo Packaging 45 | @mkdir -p $(BUILD_DIR) 46 | @pip install wheel 47 | cp config.py $(BUILD_DIR) 48 | pip wheel -w $(BUILD_DIR) -r requirements.txt --no-deps 49 | 50 | upload-job: ## Upload job.py file 51 | @echo Uploading $(JOB_NAME) 52 | aws s3 cp $(TRANSITION_STATE_PY) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/$(TRANSITION_STATE_PY) 53 | 54 | #upload: upload-job ## Upload artefacts to S3 55 | # aws s3 sync --delete $(BUILD_DIR) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/dependencies 56 | 57 | deploy: clean upload-job ## Package and upload to S3 58 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/pandas_helpers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | import pandas as pd 5 | from glue_shared.boto3_helpers import get_s3_keys 6 | 7 | LOGGER = logging.getLogger(__name__) 8 | 9 | 10 | def write_parquet( 11 | df: pd.DataFrame, 12 | s3_folder_url: str, 13 | partition_cols: List[str] = None, 14 | compression: str = None, 15 | ): 16 | """ 17 | Write Parquet file to S3 folder. 18 | 19 | Parameters 20 | ---------- 21 | df 22 | Pandas dataframe 23 | s3_folder_url 24 | S3 url: s3://<bucket>/<folder>.
25 | partition_cols 26 | Partition path by columns 27 | compression 28 | Parquet compression. Default is "snappy" 29 | 30 | """ 31 | 32 | import pyarrow as pa 33 | import pyarrow.parquet as pq 34 | import s3fs 35 | 36 | LOGGER.info("Writing parquet file to S3: %s", f"{s3_folder_url}") 37 | table = pa.Table.from_pandas(df, preserve_index=False) 38 | 39 | pq.write_to_dataset( 40 | table, 41 | s3_folder_url, 42 | filesystem=s3fs.S3FileSystem(), 43 | partition_cols=partition_cols, 44 | compression=compression or "snappy", 45 | ) 46 | 47 | 48 | def df_from_s3_json( 49 | s3_client, 50 | bucket_name: str, 51 | prefix: str, 52 | compression: str = None, 53 | lines: bool = True, 54 | ): 55 | """ 56 | Create Pandas DataFrame from multiple files in S3 prefix. 57 | 58 | Parameters 59 | ---------- 60 | s3_client 61 | boto3.client('s3') 62 | bucket_name 63 | prefix 64 | compression 65 | Json file compression. 66 | lines 67 | Multiple JSON objects per line. 68 | 69 | Returns 70 | ------- 71 | pd.DataFrame 72 | Dataframe containing data under S3 prefix. 73 | 74 | """ 75 | 76 | df_merged = pd.DataFrame() 77 | 78 | for key in get_s3_keys(s3_client, bucket_name, prefix): 79 | resp = s3_client.get_object(Bucket=bucket_name, Key=key) 80 | df = pd.read_json(resp["Body"], orient="records", lines=lines, compression=compression) 81 | df_merged = df_merged.append(df, ignore_index=True) 82 | 83 | return df_merged 84 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/variables.tf: -------------------------------------------------------------------------------- 1 | variable "script_location" { 2 | type = string 3 | description = "Specifies the S3 path to a script that executes a job." 4 | } 5 | 6 | 7 | variable "connections" { 8 | type = list(string) 9 | description = "The list of connections used for this job." 10 | default = [] 11 | } 12 | 13 | variable "default_arguments" { 14 | type = map(string) 15 | description = "The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide. For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide." 16 | default = {} 17 | } 18 | 19 | variable "description" { 20 | type = string 21 | description = "Description of the job." 22 | default = "" 23 | } 24 | 25 | variable "max_concurrent_runs" { 26 | type = string 27 | description = "The maximum number of concurrent runs allowed for a job. The default is 1." 28 | default = "1" 29 | } 30 | 31 | 32 | variable "name" { 33 | type = string 34 | description = "Name of the job" 35 | } 36 | 37 | variable "role_arn" { 38 | type = string 39 | description = "The ARN of the IAM role associated with this job." 40 | 41 | } 42 | 43 | variable "tags" { 44 | type = map(string) 45 | description = "AWS resource tags" 46 | default = {} 47 | } 48 | 49 | variable "number_of_workers" { 50 | type = number 51 | description = "Number of Glue (G.#X) workers" 52 | default = null 53 | } 54 | 55 | variable "worker_type" { 56 | description = "Worker type" 57 | type = string 58 | default = "G.1X" 59 | validation { 60 | condition = contains(["G.1X", "G.2X"], var.worker_type) 61 | error_message = "Worker type can be one of 'G.1X', 'G.2X'." 
62 | } 63 | } 64 | 65 | variable "max_retries" { 66 | description = "Number of retries" 67 | type = string 68 | default = null 69 | } 70 | -------------------------------------------------------------------------------- /terraform/solution/glue_jobs.tf: -------------------------------------------------------------------------------- 1 | ###################### Glue Jobs ######################################## 2 | 3 | module "ds1_raw_to_refined_job" { 4 | source = "../modules/glue-job/python_shell" 5 | 6 | name = "ds1-raw-to-refined" 7 | role_arn = module.glue_role.iam_role_arn 8 | script_location = "s3://${var.glue_bucket_name}/code/ds1/raw_to_refined/raw_to_refined.py" 9 | default_arguments = { 10 | "--extra-py-files" = <<EOT ... -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/argument_handlers.py: -------------------------------------------------------------------------------- ... 18 | def parse_args_fallback(arguments: Sequence, options: List[str] = None) -> Dict: 19 | """ 20 | Argument parser fallback for AWS Glue jobs. 21 | 22 | This fallback function is necessary due to the lack of API uniformity 23 | between PySpark and Python shell jobs. 24 | 25 | Parameters 26 | ---------- 27 | arguments 28 | Sequence of options and values to be parsed (sys.argv). 29 | options 30 | Options whose values are resolved. 31 | 32 | Returns 33 | ------- 34 | Parsed options and values. 35 | 36 | """ 37 | LOGGER.debug("Parsing arguments with fallback function.") 38 | LOGGER.debug("Parsing arguments: %s options: %s", arguments, options) 39 | parser = CustomArgumentParser() 40 | if not options: 41 | options = [] 42 | for opt in options: 43 | parser.add_argument(f"--{opt}", required=True) 44 | 45 | args = vars(parser.parse_known_args(arguments[1:])[0]) 46 | return args 47 | 48 | 49 | def parse_args(arguments: Sequence, options: List[str] = None) -> Dict: 50 | """ 51 | Parse input arguments. 52 | 53 | Simple check for whether the awsglue module is available; it is not in Python shell jobs. 54 | 55 | Parameters 56 | ---------- 57 | arguments 58 | Sequence of options and values to be parsed (sys.argv). 59 | options 60 | Options whose values are resolved. 61 | 62 | Returns 63 | ------- 64 | Parsed options and values.
65 | 66 | """ 67 | LOGGER.debug("Parsing arguments: %s options: %s", arguments, options) 68 | 69 | try: 70 | import awsglue.utils as au 71 | except ImportError: 72 | return parse_args_fallback(arguments, options) 73 | 74 | try: 75 | resolved = au.getResolvedOptions(args=arguments, options=options) 76 | LOGGER.debug("awsglue.utils args resolved: %s", resolved) 77 | return resolved 78 | except au.GlueArgumentError: 79 | return parse_args_fallback(arguments, options) 80 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from types import ModuleType 3 | import datetime 4 | from glue_shared import parse_args 5 | 6 | 7 | def test_parse_args_pyshell(): 8 | argv = [ 9 | "/tmp/glue-python-scripts-jm72zh6c/jan_pyshell_job.py", 10 | "--APP_SETTINGS_ENVIRONMENT", 11 | "dev", 12 | "--job-bookmark-option", 13 | "job-bookmark-disable", 14 | "--job-language", 15 | "python", 16 | ] 17 | 18 | actual = parse_args(argv, ["APP_SETTINGS_ENVIRONMENT"]) 19 | expected = {"APP_SETTINGS_ENVIRONMENT": "dev"} 20 | assert actual == expected 21 | 22 | 23 | def test_parse_args_glueetl(): 24 | argv = [ 25 | "script_2020-04-15-11-21-57.py", 26 | "--JOB_NAME", 27 | "glue-spark-job", 28 | "--APP_SETTINGS_ENVIRONMENT", 29 | "dev", 30 | "--JOB_ID", 31 | "j_3456789", 32 | "--JOB_RUN_ID", 33 | "jr_3456789", 34 | "--job-bookmark-option", 35 | "job-bookmark-disable", 36 | "--TempDir", 37 | "s3://bucker/Key/dir", 38 | ] 39 | 40 | sys.modules["dynamicframe"] = ModuleType("DynamicFrame") 41 | sys.modules["dynamicframe"].DynamicFrame = None 42 | sys.modules["awsglue.utils"] = ModuleType("awsglue.utils") 43 | sys.modules["awsglue.utils"].getResolvedOptions = lambda arguments, options=None: { 44 | "APP_SETTINGS_ENVIRONMENT": "dev", 45 | "JOB_ID": "j_3456789", 46 | } 47 | 48 | actual = parse_args(argv, ["APP_SETTINGS_ENVIRONMENT", "JOB_ID"]) 49 | expected = {"APP_SETTINGS_ENVIRONMENT": "dev", "JOB_ID": "j_3456789"} 50 | assert actual == expected 51 | 52 | 53 | def test_comma_str_time_2_time_obj(): 54 | from glue_shared.str2obj import comma_str_time_2_time_obj 55 | 56 | input1 = "2020-04-21 03:00" 57 | input2 = "2020-04-21 03:00, 2020-04-21 04:00" 58 | input3 = "2020-04-21 03:00,2020-04-20 02:00" 59 | 60 | expected1 = (datetime.datetime(2020, 4, 21, 3, 0),) 61 | expected2 = ( 62 | datetime.datetime(2020, 4, 21, 3, 0), 63 | datetime.datetime(2020, 4, 21, 4, 0), 64 | ) 65 | 66 | expected3 = ( 67 | datetime.datetime(2020, 4, 21, 3, 0), 68 | datetime.datetime(2020, 4, 20, 2, 0), 69 | ) 70 | assert comma_str_time_2_time_obj(input1) == expected1 71 | assert comma_str_time_2_time_obj(input2) == expected2 72 | assert comma_str_time_2_time_obj(input3) == expected3 73 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/defaults.py: -------------------------------------------------------------------------------- 1 | """This module intends to provide base configuration for AWS Glue jobs.""" 2 | import logging 3 | from typing import Dict 4 | 5 | 6 | class InfoDebugFilter(logging.Filter): 7 | def filter(self, rec): 8 | """Filter debug and info messages.""" 9 | return rec.levelno in (logging.DEBUG, logging.INFO) 10 | 11 | 12 | def default_logging_config(level: str = "INFO", formatter_name: str = "detailed") -> Dict: 13 | """ 14 | Create default logging config. 
15 | 16 | Parameters 17 | ---------- 18 | level 19 | Log level. 20 | formatter_name 21 | Log formatter name. Possible values: detailed or dev. 22 | 23 | Returns 24 | ------- 25 | Dictionary compatible with logging.config.dictConfig. 26 | 27 | """ 28 | logging_config = { 29 | "version": 1, 30 | "filters": {"info_debug_filter": {"()": InfoDebugFilter}}, 31 | "formatters": { 32 | "detailed": { 33 | "class": "logging.Formatter", 34 | "format": "%(asctime)s %(levelname)-8s %(name)-15s - %(message)s", 35 | }, 36 | "dev": { 37 | "class": "logging.Formatter", 38 | "format": "%(asctime)s %(levelname)s %(name)s - ++++++++ %(message)s ++++++++", 39 | }, 40 | }, 41 | "handlers": { 42 | "debug_handler": { 43 | "class": "logging.StreamHandler", 44 | "formatter": formatter_name, 45 | "level": "DEBUG", 46 | "filters": ["info_debug_filter"], 47 | "stream": "ext://sys.stdout", 48 | }, 49 | "warning": { 50 | "class": "logging.StreamHandler", 51 | "formatter": formatter_name, 52 | "level": "WARNING", 53 | "stream": "ext://sys.stdout", 54 | }, 55 | }, 56 | "loggers": { 57 | "job": { 58 | "level": level, 59 | "propagate": False, 60 | "handlers": ["debug_handler", "warning"], 61 | }, 62 | "glue_shared": { 63 | "level": level, 64 | "propagate": False, 65 | "handlers": ["debug_handler", "warning"], 66 | }, 67 | }, 68 | "root": {"level": "WARNING", "handlers": ["debug_handler", "warning"]}, 69 | } 70 | 71 | return logging_config 72 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/variables.tf: -------------------------------------------------------------------------------- 1 | variable "script_location" { 2 | type = string 3 | description = "Specifies the S3 path to a script that executes a job." 4 | } 5 | 6 | variable "python_version" { 7 | type = string 8 | description = "The Python version being used to execute a Python shell job. Allowed values are 2 or 3." 9 | default = "3" 10 | validation { 11 | condition = contains(["2", "3"], var.python_version) 12 | error_message = "Python version can be only '2' or '3'." 13 | } 14 | } 15 | 16 | variable "connections" { 17 | type = list(string) 18 | description = "The list of connections used for this job." 19 | default = [] 20 | } 21 | 22 | variable "default_arguments" { 23 | type = map(string) 24 | description = "The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide. For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide." 25 | default = {} 26 | } 27 | 28 | variable "description" { 29 | type = string 30 | description = "Description of the job." 31 | default = "" 32 | } 33 | 34 | variable "max_concurrent_runs" { 35 | type = string 36 | description = "The maximum number of concurrent runs allowed for a job. The default is 1." 37 | default = "1" 38 | } 39 | 40 | variable "name" { 41 | type = string 42 | description = "Name of the job" 43 | } 44 | 45 | variable "role_arn" { 46 | type = string 47 | description = "The ARN of the IAM role associated with this job." 
48 | 49 | } 50 | 51 | variable "tags" { 52 | type = map(string) 53 | description = "AWS resource tags" 54 | default = {} 55 | } 56 | 57 | variable "max_capacity" { 58 | type = string 59 | description = "The maximum number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. Required when pythonshell is set; accepts either 0.0625 or 1.0." 60 | default = "0.0625" 61 | validation { 62 | condition = contains(["0.0625", "1.0"], var.max_capacity) 63 | error_message = "Max capacity for a python job must be a string: '0.0625' or '1.0'." 64 | } 65 | } 66 | 67 | variable "max_retries" { 68 | description = "Number of retries" 69 | type = string 70 | default = null 71 | } 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # pytype static type analyzer 137 | .pytype/ 138 | 139 | # Cython debug symbols 140 | cython_debug/ -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_boto3_helpers.py: -------------------------------------------------------------------------------- 1 | import moto 2 | import boto3 3 | import pytest 4 | 5 | 6 | @moto.mock_ssm 7 | def test_resolve_2_valid_parameters(): 8 | from glue_shared.boto3_helpers import resolve_ssm_parameters 9 | 10 | ssm_client = boto3.client("ssm") 11 | ssm_client.put_parameter(Name="/dev/db/host", Value="127.0.0.1", Type="String") 12 | ssm_client.put_parameter(Name="/dev/db/password", Value="magic", Type="SecureString") 13 | 14 | actual = resolve_ssm_parameters( 15 | ssm_client, {"db_host": "/dev/db/host", "db_password": "/dev/db/password"} 16 | ) 17 | 18 | expected = {"db_host": "127.0.0.1", "db_password": "magic"} 19 | 20 | assert actual == expected 21 | 22 | 23 | @moto.mock_ssm 24 | def test_resolve_12_valid_parameters(): 25 | from glue_shared.boto3_helpers import resolve_ssm_parameters 26 | from string import ascii_lowercase 27 | from random import choice 28 | 29 | input_ssm_parameters = { 30 | letter: (f"/{letter}/{letter}", "value") for letter in ascii_lowercase[:12] 31 | } 32 | ssm_client = boto3.client("ssm") 33 | param_types = ["String", "SecureString"] 34 | 35 | for key, (name, value) in input_ssm_parameters.items(): 36 | ssm_client.put_parameter(Name=name, Value=value, Type=choice(param_types)) 37 | 38 | actual = resolve_ssm_parameters( 39 | ssm_client, {key: name for key, (name, value) in input_ssm_parameters.items()} 40 | ) 41 | 42 | expected = {key: value for key, (name, value) in input_ssm_parameters.items()} 43 | assert actual == expected 44 | assert len(actual) == len(expected) 45 | 46 | 47 | @moto.mock_ssm 48 | def test_resolve_12_valid_1_invalid_parameters(): 49 | from glue_shared.boto3_helpers import resolve_ssm_parameters 50 | from glue_shared.exceptions import ParametersNotFound 51 | from string import ascii_lowercase 52 | from random import choice 53 | 54 | input_ssm_parameters = { 55 | letter: (f"/{letter}/{letter}", "value") for letter in ascii_lowercase[:12] 56 | } 57 | ssm_client = boto3.client("ssm") 58 | param_types = ["String", "SecureString"] 59 | 60 | for key, (name, value) in input_ssm_parameters.items(): 61 | ssm_client.put_parameter(Name=name, Value=value, Type=choice(param_types)) 62 | 63 | input_ssm_parameters.update({"does_not_exist": ("/does/not/exist", "not_exists")}) 64 | 65 | with pytest.raises(ParametersNotFound): 66 | # The raising call is the whole assertion; any code after it would be unreachable. 67 | resolve_ssm_parameters( 68 | ssm_client, 69 | {key: name for key, (name, value) in input_ssm_parameters.items()}, 70 | ) 71 | 
-------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/glue_interface.py: -------------------------------------------------------------------------------- 1 | """A collection of functions to interface with AWS Glue.""" 2 | import logging 3 | from typing import List, Dict, Sequence, Iterable 4 | 5 | LOGGER = logging.getLogger(__name__) 6 | 7 | 8 | def get_glue_args(arguments: Sequence, options: List[str] = None) -> Dict: 9 | """ 10 | Parse arguments supplied to the job. 11 | 12 | Parameters 13 | ---------- 14 | arguments 15 | Sequence of options and values to be parsed (sys.argv). 16 | options 17 | Options whose values are resolved. 18 | 19 | Returns 20 | ------- 21 | Parsed options and values. 22 | 23 | """ 24 | LOGGER.debug("Parsing arguments for PySpark job") 25 | from awsglue.utils import getResolvedOptions 26 | 27 | LOGGER.debug("Parsing arguments: %s options: %s", arguments, options) 28 | if not options: 29 | return getResolvedOptions(args=arguments, options=["JOB_NAME"]) 30 | return getResolvedOptions(arguments, options=["JOB_NAME"] + options) 31 | 32 | 33 | def get_spark_session_and_glue_job( 34 | glue_args: Dict, 35 | conf=None, 36 | py_files: Iterable[str] = None, 37 | extra_jars: List[str] = None, 38 | ): 39 | """ 40 | Get spark session and AWS glue job. 41 | 42 | Parameters 43 | ---------- 44 | glue_args 45 | Dictionary of Argument Name: Argument value 46 | extra_jars 47 | Extra Spark dependencies in Maven-coordinate form, applied via ``spark.jars.packages``. 48 | conf : Union[pyspark.SparkConf, Dict[str, str]] 49 | Spark config, either object or dictionary of config options. 50 | py_files 51 | Paths to python files (.py, .zip, .egg) 52 | 53 | Returns 54 | ------- 55 | pyspark.sql.SparkSession, awsglue.job.Job 56 | 57 | """ 58 | from awsglue.context import GlueContext 59 | from awsglue.job import Job 60 | from pyspark import SparkContext, SparkConf 61 | 62 | LOGGER.debug("Creating spark session with parameters") 63 | LOGGER.debug("conf=%s", conf) 64 | LOGGER.debug("py_files=%s", py_files) 65 | LOGGER.debug("extra_jars=%s", extra_jars) 66 | if isinstance(conf, dict): 67 | spark_conf = SparkConf() 68 | spark_conf.setAll(conf.items()) 69 | elif isinstance(conf, SparkConf): 70 | spark_conf = conf 71 | else: 72 | spark_conf = None 73 | 74 | if extra_jars and spark_conf: # only applied when a SparkConf exists 75 | spark_dependencies = ",".join(extra_jars) 76 | spark_conf.set("spark.jars.packages", spark_dependencies) 77 | 78 | sc = SparkContext.getOrCreate(conf=spark_conf) 79 | 80 | if py_files: 81 | LOGGER.debug("Adding PYFILEs: %s", py_files) 82 | for py_file in py_files: 83 | sc.addPyFile(py_file) 84 | 85 | glue_context = GlueContext(sparkContext=sc) 86 | job = Job(glue_context=glue_context) 87 | job.init(glue_args["JOB_NAME"], glue_args) 88 | 89 | return glue_context.spark_session, job 90 | 91 | 92 | def commit_job(job): 93 | """Commit AWS glue job.""" 94 | job.commit() 95 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: check-added-large-files # Prevent giant files from being committed (500kB) 6 | - id: check-ast # Simply check whether the files parse as valid python.
7 | - id: check-byte-order-marker # Forbid files which have a UTF-8 byte-order marker 8 | - id: check-builtin-literals # Require literal syntax when initializing empty or zero Python builtin types. 9 | - id: check-case-conflict # Check for files that would conflict in case-insensitive filesystems 10 | - id: check-docstring-first # Checks a common error of defining a docstring after code. 11 | - id: check-merge-conflict # Check for files that contain merge conflict strings. 12 | - id: check-toml # This hook checks toml files for parseable syntax. 13 | - id: check-yaml # This hook checks yaml files for parseable syntax. 14 | args: 15 | - --unsafe 16 | - id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source. 17 | - id: detect-aws-credentials # Detects *your* aws credentials from the aws cli credentials file 18 | args: 19 | - --allow-missing-credentials 20 | - id: detect-private-key # Detects the presence of private keys 21 | - id: end-of-file-fixer # Ensures that a file is either empty, or ends with one newline. 22 | - id: forbid-new-submodules # Prevent addition of new git submodules 23 | - id: mixed-line-ending # Replaces or checks mixed line ending 24 | # - id: no-commit-to-branch # Don't commit to branch 25 | - id: trailing-whitespace # This hook trims trailing whitespace. 26 | args: ["--markdown-linebreak-ext=md"] 27 | # MyPy 28 | - repo: https://github.com/pre-commit/mirrors-mypy 29 | rev: v0.782 30 | hooks: 31 | - id: mypy 32 | name: mypy (ds1/raw_to_refined) 33 | exclude: (docs/|tests/) 34 | files: ^glue/data_sources/ds1/raw_to_refined/ 35 | - id: mypy 36 | name: mypy (ds1/refined_to_curated) 37 | exclude: (docs/|tests/) 38 | files: ^glue/data_sources/ds1/refined_to_curated/ 39 | - id: mypy 40 | name: mypy (dummy_job) 41 | files: ^glue/data_sources/dummy_job/dummy_transition/ 42 | exclude: (docs/|tests/) 43 | - id: mypy 44 | name: mypy (shared_lib) 45 | files: ^glue/shared/glue_shared_lib 46 | exclude: (docs/|tests/) 47 | # Black 48 | - repo: https://github.com/psf/black # repository moved from ambv/black 49 | rev: 20.8b1 # pin a release; the mutable "stable" tag no longer exists upstream 50 | hooks: 51 | - id: black 52 | args: 53 | - "-l 100" 54 | # Bandit 55 | - repo: https://github.com/PyCQA/bandit 56 | rev: 1.6.2 57 | hooks: 58 | - id: bandit 59 | exclude: (docker/|tests/) 60 | # Flake8 61 | - repo: https://gitlab.com/pycqa/flake8 62 | rev: 3.8.3 63 | hooks: 64 | - id: flake8 65 | exclude: ^(tests/) 66 | args: 67 | - --max-line-length=100 68 | # Terraform 69 | # For these hooks to work you need to have terraform-docs, TFLint and TFSec installed.
70 | - repo: https://github.com/antonbabenko/pre-commit-terraform 71 | rev: v1.43.0 72 | hooks: 73 | # - id: terraform_fmt 74 | - id: terraform_docs 75 | - id: terraform_tflint 76 | args: 77 | - --args=--enable-rule=terraform_deprecated_index 78 | - --args=--enable-rule=terraform_unused_declarations 79 | - --args=--enable-rule=terraform_comment_syntax 80 | - --args=--enable-rule=terraform_documented_outputs 81 | - --args=--enable-rule=terraform_documented_variables 82 | - --args=--enable-rule=terraform_typed_variables 83 | - --args=--enable-rule=terraform_naming_convention 84 | -------------------------------------------------------------------------------- /terraform/solution/glue_jobs_dummy.tf: -------------------------------------------------------------------------------- 1 | ###################### Glue Dummy Jobs for complex workflow demonstration ######################################## 2 | locals { 3 | dummy_job_location = "s3://${var.glue_bucket_name}/code/dummy_job/dummy_transition/dummy_transition.py" 4 | } 5 | module "dummy_job" { 6 | for_each = toset( 7 | [ 8 | "dummy_job_0", 9 | "dummy_job_1", 10 | "dummy_job_2", 11 | "dummy_job_3", 12 | "dummy_job_4", 13 | "dummy_job_5", 14 | ] 15 | ) 16 | source = "../modules/glue-job/python_shell" 17 | name = each.value 18 | script_location = local.dummy_job_location 19 | role_arn = module.glue_role.iam_role_arn 20 | default_arguments = { 21 | "--job-bookmark-option" = "job-bookmark-disable" 22 | "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 23 | "--APP_SETTINGS_ENVIRONMENT" = "dev" 24 | "--LOG_LEVEL" = "DEBUG" 25 | "--S3_BUCKET" = var.glue_bucket_name 26 | } 27 | tags = var.tags 28 | } 29 | # 30 | #module "dummy_job_2" { 31 | # source = "../modules/glue-job/python_shell" 32 | # name = "dummy_job_2" 33 | # script_location = local.dummy_job_location 34 | # role_arn = module.glue_role.iam_role_arn 35 | # default_arguments = { 36 | # "--job-bookmark-option" = "job-bookmark-disable" 37 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 38 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 39 | # "--LOG_LEVEL" = "DEBUG" 40 | # "--S3_BUCKET" = var.glue_bucket_name 41 | # } 42 | # tags = var.tags 43 | #} 44 | # 45 | #module "dummy_job_3" { 46 | # source = "../modules/glue-job/python_shell" 47 | # name = "dummy_job_3" 48 | # script_location = local.dummy_job_location 49 | # role_arn = module.glue_role.iam_role_arn 50 | # default_arguments = { 51 | # "--job-bookmark-option" = "job-bookmark-disable" 52 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 53 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 54 | # "--LOG_LEVEL" = "DEBUG" 55 | # "--S3_BUCKET" = var.glue_bucket_name 56 | # } 57 | # tags = var.tags 58 | #} 59 | # 60 | #module "dummy_job_4" { 61 | # source = "../modules/glue-job/python_shell" 62 | # name = "dummy_job_4" 63 | # script_location = local.dummy_job_location 64 | # role_arn = module.glue_role.iam_role_arn 65 | # default_arguments = { 66 | # "--job-bookmark-option" = "job-bookmark-disable" 67 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 68 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 69 | # "--LOG_LEVEL" = "DEBUG" 70 | # "--S3_BUCKET" = var.glue_bucket_name 71 | # } 72 | # tags = var.tags 73 | #} 74 | # 75 | #module "dummy_job_5" { 76 | # source = "../modules/glue-job/python_shell" 77 | # name = "dummy_job_5" 78 | # script_location = local.dummy_job_location 79 | # role_arn = module.glue_role.iam_role_arn 80 | # default_arguments = { 81 | # "--job-bookmark-option" = "job-bookmark-disable" 82 | # "--TempDir" = 
"s3://${var.glue_bucket_name}/glue-temp" 83 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 84 | # "--LOG_LEVEL" = "DEBUG" 85 | # "--S3_BUCKET" = var.glue_bucket_name 86 | # } 87 | # tags = var.tags 88 | #} 89 | # 90 | #module "dummy_job_6" { 91 | # source = "../modules/glue-job/python_shell" 92 | # name = "dummy_job_6" 93 | # script_location = "s3://${var.glue_bucket_name}/code/${module.dummy_job_1.job_name}/dummy_transition/dummy_transition.py" 94 | # role_arn = module.glue_role.iam_role_arn 95 | # default_arguments = { 96 | # "--job-bookmark-option" = "job-bookmark-disable" 97 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 98 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 99 | # "--LOG_LEVEL" = "DEBUG" 100 | # "--S3_BUCKET" = var.glue_bucket_name 101 | # } 102 | # tags = var.tags 103 | #} 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aws-glue-monorepo-style 2 | 3 | An example of AWS Glue Jobs and workflow deployment with terraform in monorepo style. 4 | 5 | To learn more about decisions behind this structure chek out the supporting articles: 6 | https://dev.to/1oglop1/aws-glue-first-experience-part-1-how-to-run-your-code-3pe3 7 | 8 | ![architecture of this solution](arch_diagram.png) 9 | (for simplicity this solution uses just 1 bucket and does not deploy database) 10 | 11 | ## Deployment: 12 | 13 | Requirements: 14 | 15 | * AWS Account 16 | * S3 bucket to store terraform state. 17 | * Rename `.evn.example` to `.env` and set the values 18 | * export environment variables from `.env` using command: `set -o allexport; source .env; set +o allexport` 19 | * `docker-compose up -d` 20 | * `docker exec -it glue /bin/bash` 21 | 22 | Now we are going to work inside the docker container 23 | 24 | * `make tf-init` 25 | * `make tf-plan` 26 | * `make tf-apply` 27 | * `make jobs-deploy` 28 | 29 | That's it! 30 | If everything went well you can now go to AWS Glue Console and explore jobs and workflows. 31 | 32 | Or start workflow from CLI `aws glue start-workflow-run --name etl-workflow--simple` 33 | 34 | Once you are finished with observations remove everything with `make tf-destroy`. 35 | 36 | ## Development 37 | 38 | With the [release of Glue 2.0 AWS](https://aws.amazon.com/blogs/big-data/developing-aws-glue-etl-jobs-locally-using-a-container/) 39 | released official Glue Docker Image you can use it for local development of glue jobs. 
40 | 41 | Example: 42 | 43 | * `docker exec -it glue /bin/bash` to connect to the container 44 | * `cd /project/glue/data_sources/ds1/raw_to_refined` 45 | * `pip install -r requirements.txt` 46 | * Run the first job: `python raw_to_refined.py --APP_SETTINGS_ENVIRONMENT=dev --LOG_LEVEL=DEBUG --S3_BUCKET=${TF_VAR_glue_bucket_name}` 47 | * `cd /project/glue/data_sources/ds1/refined_to_curated` 48 | * The next step requires results from the previous stage, `raw_to_refined` 49 | * Run the second job: `python refined_to_curated.py --APP_SETTINGS_ENVIRONMENT=dev --LOG_LEVEL=DEBUG --S3_BUCKET=${TF_VAR_glue_bucket_name}` 50 | 51 | If everything went well, you should see output like this: 52 | 53 | ``` 54 | 2020-12-23 14:28:43,278 DEBUG glue_shared.spark_helpers - DF: +--------------------+-----------+-----------+--------+-------+---+------+-----+-----+------+------+--------+-----+------+-----+---------+ 55 | | name| mfr| type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight| cups| rating| 56 | +--------------------+-----------+-----------+--------+-------+---+------+-----+-----+------+------+--------+-----+------+-----+---------+ 57 | | String|Categorical|Categorical| Int| Int|Int| Int|Float|Float| Int| Int| Int| Int| Float|Float| Float| 58 | | 100% Bran| N| C| 70| 4| 1| 130| 10| 5| 6| 280| 25| 3| 1| 0.33|68.402973| 59 | | 100% Natural Bran| Q| C| 120| 3| 5| 15| 2| 8| 8| 135| 0| 3| 1| 1|33.983679| 60 | | All-Bran| K| C| 70| 4| 1| 260| 9| 7| 5| 320| 25| 3| 1| 0.33|59.425505| 61 | |All-Bran with Ext...| K| C| 50| 4| 0| 140| 14| 8| 0| 330| 25| 3| 1| 0.5|93.704912| 62 | | Almond Delight| R| C| 110| 2| 2| 200| 1| 14| 8| -1| 25| 3| 1| 0.75|34.384843| 63 | |Apple Cinnamon Ch...| G| C| 110| 2| 2| 180| 1.5| 10.5| 10| 70| 25| 1| 1| 0.75|29.509541| 64 | | Apple Jacks| K| C| 110| 2| 0| 125| 1| 11| 14| 30| 25| 2| 1| 1|33.174094| 65 | | Basic 4| G| C| 130| 3| 2| 210| 2| 18| 8| 100| 25| 3| 1.33| 0.75|37.038562| 66 | | Bran Chex| R| C| 90| 2| 1| 200| 4| 15| 6| 125| 25| 1| 1| 0.67|49.120253| 67 | +--------------------+-----------+-----------+--------+-------+---+------+-----+-----+------+------+--------+-----+------+-----+---------+ 68 | only showing top 10 rows 69 | ``` 70 | 71 | The commands above start PySpark inside the container and look for files stored in S3 under `/ds1/refined`. 72 | PS: You should avoid running local PySpark on large datasets! 73 | 74 | ## Disclaimer 75 | 76 | Please keep in mind that the IAM roles used in this example are very broad and should not be used as is.
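
## Artifact layout in S3

Each transition's `Makefile` uploads the job script and its packaged dependencies under `code/<job>/<transition>/` in the Glue bucket, and the Terraform job definitions point at those paths. A sketch of the resulting layout after `make jobs-deploy` (bucket and wheel names illustrative):

```
s3://<glue-bucket>/code/ds1/raw_to_refined/raw_to_refined.py
s3://<glue-bucket>/code/ds1/raw_to_refined/dependencies/<wheels>.whl
s3://<glue-bucket>/code/ds1/refined_to_curated/refined_to_curated.py
s3://<glue-bucket>/code/ds1/refined_to_curated/dependencies/refined_to_curated.zip
```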
77 | -------------------------------------------------------------------------------- /dummy_data/cereal.csv: -------------------------------------------------------------------------------- 1 | name;mfr;type;calories;protein;fat;sodium;fiber;carbo;sugars;potass;vitamins;shelf;weight;cups;rating 2 | String;Categorical;Categorical;Int;Int;Int;Int;Float;Float;Int;Int;Int;Int;Float;Float;Float 3 | 100% Bran;N;C;70;4;1;130;10;5;6;280;25;3;1;0.33;68.402973 4 | 100% Natural Bran;Q;C;120;3;5;15;2;8;8;135;0;3;1;1;33.983679 5 | All-Bran;K;C;70;4;1;260;9;7;5;320;25;3;1;0.33;59.425505 6 | All-Bran with Extra Fiber;K;C;50;4;0;140;14;8;0;330;25;3;1;0.5;93.704912 7 | Almond Delight;R;C;110;2;2;200;1;14;8;-1;25;3;1;0.75;34.384843 8 | Apple Cinnamon Cheerios;G;C;110;2;2;180;1.5;10.5;10;70;25;1;1;0.75;29.509541 9 | Apple Jacks;K;C;110;2;0;125;1;11;14;30;25;2;1;1;33.174094 10 | Basic 4;G;C;130;3;2;210;2;18;8;100;25;3;1.33;0.75;37.038562 11 | Bran Chex;R;C;90;2;1;200;4;15;6;125;25;1;1;0.67;49.120253 12 | Bran Flakes;P;C;90;3;0;210;5;13;5;190;25;3;1;0.67;53.313813 13 | Cap'n'Crunch;Q;C;120;1;2;220;0;12;12;35;25;2;1;0.75;18.042851 14 | Cheerios;G;C;110;6;2;290;2;17;1;105;25;1;1;1.25;50.764999 15 | Cinnamon Toast Crunch;G;C;120;1;3;210;0;13;9;45;25;2;1;0.75;19.823573 16 | Clusters;G;C;110;3;2;140;2;13;7;105;25;3;1;0.5;40.400208 17 | Cocoa Puffs;G;C;110;1;1;180;0;12;13;55;25;2;1;1;22.736446 18 | Corn Chex;R;C;110;2;0;280;0;22;3;25;25;1;1;1;41.445019 19 | Corn Flakes;K;C;100;2;0;290;1;21;2;35;25;1;1;1;45.863324 20 | Corn Pops;K;C;110;1;0;90;1;13;12;20;25;2;1;1;35.782791 21 | Count Chocula;G;C;110;1;1;180;0;12;13;65;25;2;1;1;22.396513 22 | Cracklin' Oat Bran;K;C;110;3;3;140;4;10;7;160;25;3;1;0.5;40.448772 23 | Cream of Wheat (Quick);N;H;100;3;0;80;1;21;0;-1;0;2;1;1;64.533816 24 | Crispix;K;C;110;2;0;220;1;21;3;30;25;3;1;1;46.895644 25 | Crispy Wheat & Raisins;G;C;100;2;1;140;2;11;10;120;25;3;1;0.75;36.176196 26 | Double Chex;R;C;100;2;0;190;1;18;5;80;25;3;1;0.75;44.330856 27 | Froot Loops;K;C;110;2;1;125;1;11;13;30;25;2;1;1;32.207582 28 | Frosted Flakes;K;C;110;1;0;200;1;14;11;25;25;1;1;0.75;31.435973 29 | Frosted Mini-Wheats;K;C;100;3;0;0;3;14;7;100;25;2;1;0.8;58.345141 30 | Fruit & Fibre Dates, Walnuts, and Oats;P;C;120;3;2;160;5;12;10;200;25;3;1.25;0.67;40.917047 31 | Fruitful Bran;K;C;120;3;0;240;5;14;12;190;25;3;1.33;0.67;41.015492 32 | Fruity Pebbles;P;C;110;1;1;135;0;13;12;25;25;2;1;0.75;28.025765 33 | Golden Crisp;P;C;100;2;0;45;0;11;15;40;25;1;1;0.88;35.252444 34 | Golden Grahams;G;C;110;1;1;280;0;15;9;45;25;2;1;0.75;23.804043 35 | Grape Nuts Flakes;P;C;100;3;1;140;3;15;5;85;25;3;1;0.88;52.076897 36 | Grape-Nuts;P;C;110;3;0;170;3;17;3;90;25;3;1;0.25;53.371007 37 | Great Grains Pecan;P;C;120;3;3;75;3;13;4;100;25;3;1;0.33;45.811716 38 | Honey Graham Ohs;Q;C;120;1;2;220;1;12;11;45;25;2;1;1;21.871292 39 | Honey Nut Cheerios;G;C;110;3;1;250;1.5;11.5;10;90;25;1;1;0.75;31.072217 40 | Honey-comb;P;C;110;1;0;180;0;14;11;35;25;1;1;1.33;28.742414 41 | Just Right Crunchy Nuggets;K;C;110;2;1;170;1;17;6;60;100;3;1;1;36.523683 42 | Just Right Fruit & Nut;K;C;140;3;1;170;2;20;9;95;100;3;1.3;0.75;36.471512 43 | Kix;G;C;110;2;1;260;0;21;3;40;25;2;1;1.5;39.241114 44 | Life;Q;C;100;4;2;150;2;12;6;95;25;2;1;0.67;45.328074 45 | Lucky Charms;G;C;110;2;1;180;0;12;12;55;25;2;1;1;26.734515 46 | Maypo;A;H;100;4;1;0;0;16;3;95;25;2;1;1;54.850917 47 | Muesli Raisins, Dates, & Almonds;R;C;150;4;3;95;3;16;11;170;25;3;1;1;37.136863 48 | Muesli Raisins, Peaches, & Pecans;R;C;150;4;3;150;3;16;11;170;25;3;1;1;34.139765 49 | Mueslix Crispy 
Blend;K;C;160;3;2;150;3;17;13;160;25;3;1.5;0.67;30.313351 50 | Multi-Grain Cheerios;G;C;100;2;1;220;2;15;6;90;25;1;1;1;40.105965 51 | Nut&Honey Crunch;K;C;120;2;1;190;0;15;9;40;25;2;1;0.67;29.924285 52 | Nutri-Grain Almond-Raisin;K;C;140;3;2;220;3;21;7;130;25;3;1.33;0.67;40.692320 53 | Nutri-grain Wheat;K;C;90;3;0;170;3;18;2;90;25;3;1;1;59.642837 54 | Oatmeal Raisin Crisp;G;C;130;3;2;170;1.5;13.5;10;120;25;3;1.25;0.5;30.450843 55 | Post Nat. Raisin Bran;P;C;120;3;1;200;6;11;14;260;25;3;1.33;0.67;37.840594 56 | Product 19;K;C;100;3;0;320;1;20;3;45;100;3;1;1;41.503540 57 | Puffed Rice;Q;C;50;1;0;0;0;13;0;15;0;3;0.5;1;60.756112 58 | Puffed Wheat;Q;C;50;2;0;0;1;10;0;50;0;3;0.5;1;63.005645 59 | Quaker Oat Squares;Q;C;100;4;1;135;2;14;6;110;25;3;1;0.5;49.511874 60 | Quaker Oatmeal;Q;H;100;5;2;0;2.7;-1;-1;110;0;1;1;0.67;50.828392 61 | Raisin Bran;K;C;120;3;1;210;5;14;12;240;25;2;1.33;0.75;39.259197 62 | Raisin Nut Bran;G;C;100;3;2;140;2.5;10.5;8;140;25;3;1;0.5;39.703400 63 | Raisin Squares;K;C;90;2;0;0;2;15;6;110;25;3;1;0.5;55.333142 64 | Rice Chex;R;C;110;1;0;240;0;23;2;30;25;1;1;1.13;41.998933 65 | Rice Krispies;K;C;110;2;0;290;0;22;3;35;25;1;1;1;40.560159 66 | Shredded Wheat;N;C;80;2;0;0;3;16;0;95;0;1;0.83;1;68.235885 67 | Shredded Wheat 'n'Bran;N;C;90;3;0;0;4;19;0;140;0;1;1;0.67;74.472949 68 | Shredded Wheat spoon size;N;C;90;3;0;0;3;20;0;120;0;1;1;0.67;72.801787 69 | Smacks;K;C;110;2;1;70;1;9;15;40;25;2;1;0.75;31.230054 70 | Special K;K;C;110;6;0;230;1;16;3;55;25;1;1;1;53.131324 71 | Strawberry Fruit Wheats;N;C;90;2;0;15;3;15;5;90;25;2;1;1;59.363993 72 | Total Corn Flakes;G;C;110;2;1;200;0;21;3;35;100;3;1;1;38.839746 73 | Total Raisin Bran;G;C;140;3;1;190;4;15;14;230;100;3;1.5;1;28.592785 74 | Total Whole Grain;G;C;100;3;1;200;3;16;3;110;100;3;1;1;46.658844 75 | Triples;G;C;110;2;1;250;0;21;3;60;25;3;1;0.75;39.106174 76 | Trix;G;C;110;1;1;140;0;13;12;25;25;2;1;1;27.753301 77 | Wheat Chex;R;C;100;3;1;230;3;17;3;115;25;1;1;0.67;49.787445 78 | Wheaties;G;C;100;3;1;200;3;17;3;110;25;1;1;1;51.592193 79 | Wheaties Honey Gold;G;C;110;2;1;200;1;16;8;60;25;1;1;0.75;36.187559 80 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/boto3_helpers.py: -------------------------------------------------------------------------------- 1 | """Convenient methods requiring boto3 client.""" 2 | import json 3 | import logging 4 | from typing import Dict 5 | from urllib.parse import urlsplit 6 | 7 | from glue_shared.exceptions import ParametersNotFound, JobFailedError, DataNotAvailable 8 | from glue_shared.helpers import chunked 9 | 10 | LOGGER = logging.getLogger(__name__) 11 | 12 | 13 | def resolve_ssm_parameters(ssm_client, parameters: Dict) -> Dict: 14 | """ 15 | Resolve multiple SSM parameters from a dict. 16 | 17 | Parameters 18 | ---------- 19 | ssm_client 20 | boto3.client('ssm') 21 | parameters 22 | A dictionary of friendly names and parameter names. 23 | 24 | Examples 25 | -------- 26 | >>> import boto3 27 | ... resolve_ssm_parameters(boto3.client('ssm'), {"db_host": "/dev/db/HOST"}) 28 | {'db_host': 'value'} 29 | 30 | Returns 31 | ------- 32 | dict 33 | The original dict with resolved values instead of ssm paths. 
34 | 35 | """ 36 | tmp = {value: key for key, value in parameters.items()} 37 | 38 | valid_parameters = [] 39 | invalid_parameters = [] 40 | 41 | for chunk in chunked(tuple(tmp.keys()), 10): 42 | response = ssm_client.get_parameters(Names=chunk, WithDecryption=True) 43 | valid_parameters.extend(response["Parameters"]) 44 | invalid_parameters.extend(response["InvalidParameters"]) 45 | 46 | if invalid_parameters: 47 | raise ParametersNotFound("Unable to get parameters.", *invalid_parameters) 48 | 49 | tmp.update({param["Name"]: param["Value"] for param in valid_parameters}) 50 | 51 | return {key: tmp[value] for key, value in parameters.items()} 52 | 53 | 54 | def get_connection(glue_client, name: str) -> Dict: 55 | """ 56 | Get connection properties. 57 | 58 | Parameters 59 | ---------- 60 | glue_client 61 | boto3.client('glue') 62 | name 63 | A connection name. 64 | 65 | Examples 66 | -------- 67 | >>> import boto3 68 | ... get_connection(boto3.client('glue'), "connection-name") 69 | { 70 | 'NAME': '', 71 | 'TYPE': '', 72 | 'JDBC_CONNECTION_URL': '', 73 | 'PASSWORD': '', 74 | 'USERNAME': '', 75 | } 76 | 77 | Returns 78 | ------- 79 | A dictionary of connection properties. 80 | The mapping is a simplified version of the 81 | boto3 response: 82 | { 83 | 'NAME': 'Name', 84 | 'TYPE': 'ConnectionType', 85 | ... then follows ConnectionProperties: 86 | 'JDBC_CONNECTION_URL': '', 87 | 'PASSWORD': '', 88 | 'USERNAME': '', 89 | ... and, if JDBC: 90 | 'HOST': '', 91 | 'PORT': '', 92 | 'DATABASE': '', 93 | } 94 | 95 | """ 96 | response = glue_client.get_connection(Name=name) 97 | 98 | ret = { 99 | "NAME": response["Connection"]["Name"], 100 | "TYPE": response["Connection"]["ConnectionType"], 101 | **response["Connection"]["ConnectionProperties"], 102 | } 103 | 104 | if response["Connection"]["ConnectionType"] == "JDBC": 105 | jdbc_url = response["Connection"]["ConnectionProperties"]["JDBC_CONNECTION_URL"] 106 | # Slice the scheme off: str.lstrip("jdbc:") strips *characters*, not the prefix. 107 | surl = urlsplit(jdbc_url[len("jdbc:"):] if jdbc_url.startswith("jdbc:") else jdbc_url) 108 | 109 | ret.update( 110 | { 111 | "HOST": surl.hostname, 112 | "PORT": surl.port, 113 | "DATABASE": surl.path.lstrip("/"), 114 | } 115 | ) 116 | 117 | return ret 118 | 119 | 120 | def gracefully_exit( 121 | sns_client, 122 | sns_topic_arn, 123 | process_results: Dict, 124 | job_result: str = "PASS", 125 | message: str = "Job failed.", 126 | ): 127 | """ 128 | Publish process results to the workflow SNS topic and exit with an error if the job failed. 129 | 130 | Parameters 131 | ---------- 132 | sns_client 133 | boto3.client('sns') 134 | sns_topic_arn 135 | Workflow notification topic ARN. 136 | process_results 137 | A dictionary containing the results of the processing. 138 | This dict must contain a JSON-serialisable object. 139 | job_result 140 | Job result: FAIL or PASS (not validated here). 141 | message 142 | Exit message if the job fails. 143 | 144 | """ 145 | LOGGER.debug("Sending SNS message") 146 | LOGGER.debug("Process results: %s", process_results) 147 | sns_client.publish(TopicArn=sns_topic_arn, Message=json.dumps(process_results)) 148 | if job_result == "FAIL": 149 | LOGGER.debug("Exiting with message. %s", message)
%s", message) 150 | raise JobFailedError(message) 151 | 152 | 153 | def get_s3_keys(client, bucket_name: str, prefix: str): 154 | """ 155 | Get keys from S3 bucket by prefix 156 | Parameters 157 | ---------- 158 | client 159 | boto3.client('s3') 160 | bucket_name 161 | prefix 162 | 163 | Yields 164 | ------- 165 | list_objects_v2 Response 166 | 167 | """ 168 | paginator = client.get_paginator("list_objects_v2") 169 | response_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) 170 | LOGGER.info("Reading S3: s3://%s/%s", bucket_name, prefix) 171 | for idx, page in enumerate(response_iterator): 172 | try: 173 | for response in page["Contents"]: 174 | yield response["Key"] 175 | except KeyError: 176 | raise DataNotAvailable(f"No data available at: s3://{bucket_name}/{prefix}") 177 | --------------------------------------------------------------------------------