├── .terraform-version ├── glue ├── shared │ └── glue_shared_lib │ │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_helpers.py │ │ ├── test_glue_functions.py │ │ ├── test_utils.py │ │ └── test_boto3_helpers.py │ │ ├── requirements-dev.txt │ │ ├── src │ │ └── glue_shared │ │ │ ├── exceptions.py │ │ │ ├── str2obj.py │ │ │ ├── __init__.py │ │ │ ├── spark_helpers.py │ │ │ ├── helpers.py │ │ │ ├── pandas_helpers.py │ │ │ ├── argument_handlers.py │ │ │ ├── defaults.py │ │ │ ├── glue_interface.py │ │ │ └── boto3_helpers.py │ │ ├── setup.py │ │ └── Makefile └── data_sources │ ├── dummy_job │ └── dummy_transition │ │ ├── requirements.txt │ │ ├── dummy_transition.py │ │ └── Makefile │ └── ds1 │ ├── refined_to_curated │ ├── requirements.txt │ ├── config.py │ ├── refined_to_curated.py │ └── Makefile │ └── raw_to_refined │ ├── requirements.txt │ ├── config.py │ ├── raw_to_refined.py │ └── Makefile ├── _images ├── job_1.png ├── job_2.png └── job_3.png ├── arch_diagram.png ├── terraform ├── modules │ ├── glue-workflow │ │ ├── outputs.tf │ │ ├── main.tf │ │ ├── variables.tf │ │ └── README.md │ ├── iam-role │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── README.md │ ├── s3-bucket │ │ ├── outputs.tf │ │ ├── main.tf │ │ ├── variables.tf │ │ └── README.md │ ├── glue-job │ │ ├── 2.0 │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ ├── README.md │ │ │ └── variables.tf │ │ └── python_shell │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ ├── README.md │ │ │ └── variables.tf │ └── iam-policy │ │ ├── main.tf │ │ ├── variables.tf │ │ └── README.md └── solution │ ├── terraform.tfvars │ ├── variables.tf │ ├── provider.tf │ ├── README.md │ ├── glue_workflow_simple.tf │ ├── s3.tf │ ├── .terraform.lock.hcl │ ├── iam.tf │ ├── glue_workflow_complex.tf │ ├── glue_jobs.tf │ └── glue_jobs_dummy.tf ├── .env.example ├── Dockerfile ├── docker-compose.yml ├── Makefile ├── LICENSE ├── glue-jobs.sh ├── .gitignore ├── .pre-commit-config.yaml ├── README.md └── dummy_data └── cereal.csv /.terraform-version: -------------------------------------------------------------------------------- 1 | 0.14.3 2 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /glue/data_sources/dummy_job/dummy_transition/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | moto==1.3.14 2 | -------------------------------------------------------------------------------- /_images/job_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/_images/job_1.png -------------------------------------------------------------------------------- /_images/job_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/_images/job_2.png 
-------------------------------------------------------------------------------- /_images/job_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/_images/job_3.png -------------------------------------------------------------------------------- /arch_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1oglop1/aws-glue-monorepo-style/HEAD/arch_diagram.png -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/requirements.txt: -------------------------------------------------------------------------------- 1 | file:../../../shared/glue_shared_lib#egg=glue-shared 2 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.0.3 2 | pyarrow==2.0.0 3 | s3fs==0.4.2 4 | file:../../../shared/glue_shared_lib#egg=glue-shared 5 | -------------------------------------------------------------------------------- /terraform/modules/glue-workflow/outputs.tf: -------------------------------------------------------------------------------- 1 | output "workflow_name" { 2 | description="AWS Glue Workflow Name" 3 | value = aws_glue_workflow.glue_workflow.name 4 | } 5 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_role" "role" { 2 | name = var.iam_role_name 3 | assume_role_policy = var.assume_role_policy 4 | tags = var.tags 5 | } 6 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | description = "S3 bucket ID" 3 | value = aws_s3_bucket.s3.id 4 | } 5 | 6 | output "arn" { 7 | description = "S3 bucket ARN" 8 | value = aws_s3_bucket.s3.arn 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/outputs.tf: -------------------------------------------------------------------------------- 1 | output "job_name" { 2 | description="AWS Glue Job Name" 3 | value = aws_glue_job.job.name 4 | } 5 | 6 | output "job_arn" { 7 | description="AWS Glue Job ARN" 8 | value = aws_glue_job.job.arn 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/outputs.tf: -------------------------------------------------------------------------------- 1 | output "job_name" { 2 | description="AWS Glue Job Name" 3 | value = aws_glue_job.job.name 4 | } 5 | 6 | output "job_arn" { 7 | description="AWS Glue Job ARN" 8 | value = aws_glue_job.job.arn 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/outputs.tf: -------------------------------------------------------------------------------- 1 | output "iam_role_id" { 2 | description = "IAM role id" 3 | value = aws_iam_role.role.id 4 | } 5 | 6 | output "iam_role_arn" { 7 | description = "IAM role ARN" 8 | value = aws_iam_role.role.arn 9 | } 10 | -------------------------------------------------------------------------------- 
/.env.example: -------------------------------------------------------------------------------- 1 | TF_VAR_glue_bucket_name="bucket-where-glue-items-are-stored" 2 | TF_STATE_BUCKET="your-tf-state-bucket-name" 3 | AWS_REGION=us-east-1 4 | AWS_DEFAULT_REGION=us-east-1 5 | AWS_SECRET_ACCESS_KEY= 6 | AWS_ACCESS_KEY_ID= 7 | -------------------------------------------------------------------------------- /glue/data_sources/dummy_job/dummy_transition/dummy_transition.py: -------------------------------------------------------------------------------- 1 | """ 2 | A dummy transition for workflow simulation 3 | """ 4 | 5 | import datetime 6 | 7 | 8 | def main(): 9 | print(f"Dummy job runs at: {datetime.datetime.now()}") 10 | 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /terraform/modules/iam-policy/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_policy" "policy" { 2 | name = var.iam_role_policy_name 3 | policy = var.iam_role_policy 4 | } 5 | 6 | resource "aws_iam_policy_attachment" "attach_policy" { 7 | name = var.attachment_name 8 | roles = var.roles 9 | policy_arn = aws_iam_policy.policy.arn 10 | } 11 | -------------------------------------------------------------------------------- /terraform/solution/terraform.tfvars: -------------------------------------------------------------------------------- 1 | 2 | ######################################## 3 | # Account metadata 4 | ######################################## 5 | 6 | #assume_role_name = "terraform-user" 7 | #infra_provisioner = "terraform" 8 | region = "us-east-1" 9 | 10 | tags = { 11 | "terraform" = "true" 12 | } 13 | 14 | #glue_role = "glue-role" 15 | -------------------------------------------------------------------------------- /terraform/solution/variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | type = string 3 | description = "AWS region name" 4 | default = "us-east-1" 5 | } 6 | variable "tags" { 7 | type = map(string) 8 | description = "AWS resource tags" 9 | default = {} 10 | } 11 | 12 | variable "glue_bucket_name" { 13 | type = string 14 | description = "S3 bucket name where glue jobs are stored" 15 | } 16 | -------------------------------------------------------------------------------- /terraform/solution/provider.tf: -------------------------------------------------------------------------------- 1 | ######################################## 2 | # Provider 3 | ######################################## 4 | terraform { 5 | required_version = "0.14.3" 6 | backend "s3" { 7 | key = "" 8 | } 9 | required_providers { 10 | aws = { 11 | source = "hashicorp/aws" 12 | version = "~> 3.22.0" 13 | } 14 | } 15 | } 16 | 17 | provider "aws" { 18 | region = var.region 19 | } 20 | -------------------------------------------------------------------------------- /terraform/modules/glue-workflow/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_glue_workflow" "glue_workflow" { 2 | name = var.workflow_name 3 | } 4 | 5 | resource "aws_glue_security_configuration" "glue_security" { 6 | name = var.security_name 7 | 8 | encryption_configuration { 9 | cloudwatch_encryption { 10 | } 11 | 12 | job_bookmarks_encryption { 13 | } 14 | 15 | s3_encryption { 16 | s3_encryption_mode = "SSE-S3" 17 | } 18 | } 19 | } 20 | 
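# A possible extension, sketched here rather than taken from the original
# module: aws_glue_security_configuration.glue_security is created above but
# never exported, so jobs cannot reference it. Exporting its name would let
# the solution layer wire it into aws_glue_job's security_configuration
# argument.
output "security_configuration_name" {
  description = "AWS Glue security configuration name"
  value       = aws_glue_security_configuration.glue_security.name
}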
-------------------------------------------------------------------------------- /terraform/modules/glue-workflow/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "workflow_name" { 6 | type = string 7 | description = "The name you assign to this workflow." 8 | } 9 | 10 | 11 | variable "security_name" { 12 | type = string 13 | description = "Name of the security configuration." 14 | } 15 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "s3" { 2 | bucket = var.bucket_name 3 | force_destroy = true 4 | lifecycle { 5 | prevent_destroy = false 6 | } 7 | versioning { 8 | enabled = var.versioning_enabled 9 | } 10 | tags = var.tags 11 | 12 | 13 | server_side_encryption_configuration { 14 | rule { 15 | apply_server_side_encryption_by_default { 16 | sse_algorithm = "AES256" 17 | } 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "bucket_name" { 6 | type = string 7 | description = "The name of the bucket" 8 | } 9 | 10 | variable "tags" { 11 | type = map(any) 12 | description = "Tags associated with the bucket" 13 | } 14 | 15 | variable "versioning_enabled" { 16 | type = string 17 | description = "Enable versioning" 18 | } 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amazon/aws-glue-libs:glue_libs_1.0.0_image_01 2 | 3 | ARG TERRAFORM_VERSION="0.14.3" 4 | 5 | RUN curl https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip > terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ 6 | unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip -d /bin && \ 7 | rm -f terraform_${TERRAFORM_VERSION}_linux_amd64.zip 8 | 9 | RUN pip install -U pip && \ 10 | pip install -U wheel && \ 11 | pip install -U setuptools && \ 12 | pip install -U awscli boto3 13 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions specific to glue_shared library.""" 2 | 3 | 4 | class GlueJobError(Exception): 5 | """Base glue error.""" 6 | 7 | 8 | class ParametersNotFound(GlueJobError): 9 | """SSM parameters not found.""" 10 | 11 | 12 | class DataNotAvailable(GlueJobError): 13 | """Data not available.""" 14 | 15 | 16 | class JobFailedError(GlueJobError): 17 | """It looks like SystemExit is caught by glue, hence this is needed.""" 18 | 19 | 20 | class IllegalArgumentError(ValueError): 21 | """Illegal arguments supplied.""" 22 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | def 
test_chunked(): 2 | from glue_shared.helpers import chunked 3 | 4 | l1 = [x for x in range(10)] 5 | l2 = [x for x in range(12)] 6 | 7 | assert tuple(chunked(l1, 3)) == ([0, 1, 2], [3, 4, 5], [6, 7, 8], [9]) 8 | assert tuple(chunked(l1, 5)) == ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]) 9 | assert tuple(chunked(l1, 10)) == ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],) 10 | assert tuple(chunked(l2, 3)) == ([0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]) 11 | assert tuple(chunked(l2, 10)) == ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11]) 12 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | glue: 5 | container_name: glue 6 | build: 7 | dockerfile: "./Dockerfile" 8 | context: "./" 9 | command: "tail -f /dev/null" 10 | working_dir: "/project" 11 | volumes: 12 | - "./:/project" 13 | environment: 14 | TF_VAR_glue_bucket_name: ${TF_VAR_glue_bucket_name} 15 | TF_STATE_BUCKET: ${TF_STATE_BUCKET} 16 | AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} 17 | AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} 18 | AWS_REGION: ${AWS_REGION} 19 | AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION} 20 | TERRAFORM_VERSION: 0.14.3 21 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "iam_role_name" { 6 | type = string 7 | description = "The name of the role. If omitted, Terraform will assign a random, unique name." 8 | } 9 | 10 | variable "assume_role_policy" { 11 | type = string 12 | description = "The policy that grants an entity permission to assume the role." 
13 | } 14 | 15 | variable "tags" { 16 | type = map(any) 17 | description = "Key-value mapping of tags for the IAM role" 18 | } 19 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_glue_job" "job" { 2 | name = var.name 3 | connections = var.connections 4 | 5 | number_of_workers = var.number_of_workers 6 | worker_type = var.worker_type 7 | 8 | max_retries = var.max_retries 9 | 10 | glue_version = "2.0" 11 | 12 | command { 13 | name = "glueetl" 14 | script_location = var.script_location 15 | } 16 | 17 | default_arguments = var.default_arguments 18 | description = var.description 19 | execution_property { 20 | max_concurrent_runs = var.max_concurrent_runs 21 | } 22 | 23 | role_arn = var.role_arn 24 | tags = var.tags 25 | } 26 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_glue_job" "job" { 2 | name = var.name 3 | connections = var.connections 4 | 5 | max_capacity = var.max_capacity 6 | 7 | max_retries = var.max_retries 8 | 9 | glue_version = "1.0" 10 | 11 | command { 12 | name = "pythonshell" 13 | script_location = var.script_location 14 | python_version = var.python_version 15 | } 16 | 17 | default_arguments = var.default_arguments 18 | description = var.description 19 | execution_property { 20 | max_concurrent_runs = var.max_concurrent_runs 21 | } 22 | 23 | role_arn = var.role_arn 24 | tags = var.tags 25 | } 26 | -------------------------------------------------------------------------------- /terraform/modules/glue-workflow/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | security\_name | Name of the security configuration. | `string` | n/a | yes | 19 | | workflow\_name | The name you assign to this workflow. 
| `string` | n/a | yes | 20 | 21 | ## Outputs 22 | 23 | | Name | Description | 24 | |------|-------------| 25 | | workflow\_name | AWS Glue Workflow Name | 26 | 27 | 28 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_glue_functions.py: -------------------------------------------------------------------------------- 1 | def test_get_glue_args(): 2 | from types import ModuleType 3 | import sys 4 | 5 | sys_argv = ["--APP_SETTINGS_ENVIRONMENT", "dev", "--JOB_NAME", "job"] 6 | expected = {"APP_SETTINGS_ENVIRONMENT": "dev", "JOB_NAME": "job"} 7 | 8 | def getResolvedOptions(args, options): 9 | """Fake version of awsglue.utils.getResolvedOptions.""" 10 | return {"APP_SETTINGS_ENVIRONMENT": "dev", "JOB_NAME": "job"} 11 | 12 | sys.modules["awsglue.utils"] = ModuleType("awsglue.utils") 13 | sys.modules["awsglue.utils"].getResolvedOptions = getResolvedOptions 14 | 15 | from glue_shared import get_glue_args 16 | 17 | assert get_glue_args(sys_argv, ["APP_SETTINGS_ENVIRONMENT"]) == expected 18 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/str2obj.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | from typing import Tuple 4 | 5 | import dateutil.parser 6 | 7 | LOGGER = logging.getLogger(__name__) 8 | 9 | 10 | def str2bool(value): 11 | return value.lower() == "true" 12 | 13 | 14 | def comma_str_time_2_time_obj(comma_str: str) -> Tuple[datetime.datetime, ...]: 15 | """ 16 | Convert comma separated time strings into a tuple of datetime objects. 17 | 18 | Parameters 19 | ---------- 20 | comma_str 21 | Comma separated times: 2020-04-20 16:00:00, 2020-04-20 15:00:00 22 | 23 | Returns 24 | ------- 25 | A tuple of datetime objects. 26 | 27 | """ 28 | 29 | return tuple(dateutil.parser.parse(time_str) for time_str in comma_str.split(",")) 30 | -------------------------------------------------------------------------------- /terraform/solution/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | | Name | Version | 7 | |------|---------| 8 | | terraform | 0.14.3 | 9 | | aws | ~> 3.22.0 | 10 | 11 | ## Providers 12 | 13 | | Name | Version | 14 | |------|---------| 15 | | aws | ~> 3.22.0 | 16 | 17 | ## Inputs 18 | 19 | | Name | Description | Type | Default | Required | 20 | |------|-------------|------|---------|:--------:| 21 | | glue\_bucket\_name | S3 bucket name where glue jobs are stored | `string` | n/a | yes | 22 | | region | AWS region name | `string` | `"us-east-1"` | no | 23 | | tags | AWS resource tags | `map(string)` | `{}` | no | 24 | 25 | ## Outputs 26 | 27 | No output. 28 | 29 | 30 | -------------------------------------------------------------------------------- /terraform/modules/s3-bucket/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements.
7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | bucket\_name | The name of the bucket | `string` | n/a | yes | 19 | | tags | Tags associated with the bucket | `map(any)` | n/a | yes | 20 | | versioning\_enabled | Enable versioning | `string` | n/a | yes | 21 | 22 | ## Outputs 23 | 24 | | Name | Description | 25 | |------|-------------| 26 | | arn | S3 bucket ARN | 27 | | id | S3 bucket ID | 28 | 29 | 30 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Config file for the ds1 refined-to-curated Glue job. 3 | """ 4 | import logging.config 5 | import sys 6 | 7 | from glue_shared import parse_args 8 | from glue_shared.defaults import default_logging_config 9 | 10 | region = "us-east-1" 11 | arguments = parse_args(sys.argv, ["APP_SETTINGS_ENVIRONMENT", "LOG_LEVEL", "S3_BUCKET"]) 12 | 13 | LOGGING_CONFIG = default_logging_config(arguments["LOG_LEVEL"]) 14 | logging.config.dictConfig(LOGGING_CONFIG) 15 | 16 | JOB_CONFIG = dict(arguments) 17 | # must be hard-coded because glue does not provide this in PyShell jobs 18 | JOB_CONFIG["JOB_NAME"] = JOB_CONFIG.get("JOB_NAME") or "ds1-refined-to-curated" 19 | JOB_CONFIG["JOB_ID"] = JOB_CONFIG.get("JOB_ID") 20 | JOB_CONFIG["JOB_RUN_ID"] = JOB_CONFIG.get("JOB_RUN_ID") 21 | 22 | JOB_CONFIG["s3_prefix"] = "ds1/refined" 23 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="glue_shared", 5 | author="Jan Gazda", 6 | author_email="jan.gazda@cloudreach.com", 7 | python_requires=">=3.6", 8 | classifiers=[ 9 | "Development Status :: 2 - Pre-Alpha", 10 | "Intended Audience :: Developers", 11 | "Natural Language :: English", 12 | "Programming Language :: Python :: 3.6", 13 | "Programming Language :: Python :: 3.7", 14 | "Programming Language :: Python :: 3.8", 15 | ], 16 | description="Helper library for AWS Glue jobs.", 17 | setup_requires=["wheel"], 18 | package_dir={"": "src"}, 19 | packages=find_packages(where="src", exclude=["contrib", "docs", "tests"]), 20 | test_suite="tests", 21 | version="0.0.1", 22 | zip_safe=True, 23 | ) 24 | -------------------------------------------------------------------------------- /terraform/modules/iam-role/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | assume\_role\_policy | The policy that grants an entity permission to assume the role. | `string` | n/a | yes | 19 | | iam\_role\_name | The name of the role. If omitted, Terraform will assign a random, unique name.
| `string` | n/a | yes | 20 | | tags | Key-value mapping of tags for the IAM role | `map(any)` | n/a | yes | 21 | 22 | ## Outputs 23 | 24 | | Name | Description | 25 | |------|-------------| 26 | | iam\_role\_arn | IAM role ARN | 27 | | iam\_role\_id | IAM role id | 28 | 29 | 30 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/__init__.py: -------------------------------------------------------------------------------- 1 | """Collection of convenient functions shared among the glue jobs.""" 2 | 3 | import logging 4 | 5 | from glue_shared.argument_handlers import parse_args 6 | from glue_shared.boto3_helpers import ( 7 | resolve_ssm_parameters, 8 | get_connection, 9 | gracefully_exit, 10 | ) 11 | from glue_shared.glue_interface import ( 12 | get_glue_args, 13 | get_spark_session_and_glue_job, 14 | commit_job, 15 | ) 16 | from glue_shared.str2obj import str2bool, comma_str_time_2_time_obj 17 | 18 | LOGGER = logging.getLogger("glue_shared") 19 | LOGGER.addHandler(logging.NullHandler()) 20 | 21 | __all__ = [ 22 | "parse_args", 23 | "resolve_ssm_parameters", 24 | "get_connection", 25 | "gracefully_exit", 26 | "get_glue_args", 27 | "get_spark_session_and_glue_job", 28 | "commit_job", 29 | "str2bool", 30 | "comma_str_time_2_time_obj", 31 | ] 32 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/spark_helpers.py: -------------------------------------------------------------------------------- 1 | """Useful functions to simplify Spark functionality.""" 2 | import logging 3 | 4 | LOGGER = logging.getLogger(__name__) 5 | 6 | 7 | def read_parquet(spark, path): 8 | """Read the data from a raw zone bucket.""" 9 | LOGGER.info("Reading parquet data from %s", path) 10 | df = spark.read.parquet(path) 11 | LOGGER.debug("DF: %s", show_spark_df(df, 10)) 12 | return df 13 | 14 | 15 | def show_spark_df(df, n=20, truncate=True, vertical=False): 16 | """ 17 | Show DataFrame as str, useful for logging. 18 | 19 | Notes 20 | ----- 21 | Reimplemented from: 22 | https://spark.apache.org/docs/2.4.5/api/python/_modules/pyspark/sql/dataframe.html#DataFrame.show 23 | """ 24 | if isinstance(truncate, bool) and truncate: 25 | return df._jdf.showString(n, 20, vertical) 26 | else: 27 | return df._jdf.showString(n, int(truncate), vertical) 28 | -------------------------------------------------------------------------------- /terraform/modules/iam-policy/variables.tf: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | ## Module variables 3 | ######################################################## 4 | 5 | variable "iam_role_policy_name" { 6 | type = string 7 | description = "The name of the policy. If omitted, Terraform will assign a random, unique name." 8 | } 9 | 10 | variable "iam_role_policy" { 11 | type = string 12 | description = "The policy document. This is a JSON formatted string. For more information about building AWS IAM policy documents with Terraform, see the AWS IAM Policy Document Guide." 13 | } 14 | 15 | variable "attachment_name" { 16 | type = string 17 | description = "The name of the attachment." 18 | } 19 | 20 | variable "roles" { 21 | type = list(any) 22 | description = "The role(s) the policy should be applied to."
23 | } -------------------------------------------------------------------------------- /terraform/solution/glue_workflow_simple.tf: -------------------------------------------------------------------------------- 1 | module "glue_workflow_simple" { 2 | source = "../modules/glue-workflow" 3 | workflow_name = "etl-workflow-simple" 4 | security_name = "glueSecurityConfigSimple" 5 | } 6 | 7 | ###################### Glue Triggers and DAG ######################################## 8 | 9 | resource "aws_glue_trigger" "start_raw_to_refined" { 10 | name = "start_raw_to_refined" 11 | type = "ON_DEMAND" 12 | workflow_name = module.glue_workflow_simple.workflow_name 13 | actions { 14 | job_name = module.ds1_raw_to_refined_job.job_name 15 | } 16 | } 17 | 18 | resource "aws_glue_trigger" "run_refined_to_curated" { 19 | name = "run_refined_to_curated" 20 | type = "CONDITIONAL" 21 | workflow_name = module.glue_workflow_simple.workflow_name 22 | actions { 23 | job_name = module.ds1_refined_to_curated_job.job_name 24 | } 25 | 26 | predicate { 27 | conditions { 28 | job_name = module.ds1_raw_to_refined_job.job_name 29 | state = "SUCCEEDED" 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration file for ds1-raw-to-refined Python Shell Glue job. 3 | """ 4 | 5 | import logging.config 6 | import sys 7 | 8 | from glue_shared import parse_args 9 | from glue_shared.defaults import default_logging_config 10 | 11 | arguments = parse_args(sys.argv, ["APP_SETTINGS_ENVIRONMENT", "LOG_LEVEL", "S3_BUCKET"]) 12 | 13 | LOGGING_CONFIG = default_logging_config(arguments["LOG_LEVEL"]) 14 | logging.config.dictConfig(LOGGING_CONFIG) 15 | 16 | JOB_CONFIG = dict(arguments) 17 | # must be hard-coded because glue does not provide this in PyShell jobs 18 | JOB_CONFIG["JOB_NAME"] = JOB_CONFIG.get("JOB_NAME") or "ds1-raw-to-refined" 19 | JOB_CONFIG["JOB_ID"] = JOB_CONFIG.get("JOB_ID") 20 | JOB_CONFIG["JOB_RUN_ID"] = JOB_CONFIG.get("JOB_RUN_ID") 21 | 22 | JOB_CONFIG["WORKFLOW_NAME"] = JOB_CONFIG.get("WORKFLOW_NAME") 23 | JOB_CONFIG["WORKFLOW_RUN_ID"] = JOB_CONFIG.get("WORKFLOW_RUN_ID") 24 | 25 | # raw data 26 | 27 | JOB_CONFIG["s3_raw_prefix"] = "ds1/raw" 28 | JOB_CONFIG["s3_refined_prefix"] = "ds1/refined" 29 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/raw_to_refined.py: -------------------------------------------------------------------------------- 1 | """ 2 | AWS GLUE PyShell Job to process RAW data. 3 | From Raw zone to Refined zone.
4 | """ 5 | import logging 6 | import pandas as pd 7 | from glue_shared.pandas_helpers import write_parquet 8 | 9 | LOGGER = logging.getLogger("job") 10 | 11 | 12 | def main(): 13 | LOGGER.info("JOB_NAME: %s", JOB_CONFIG["JOB_NAME"]) 14 | LOGGER.info("JOB_ID: %s", JOB_CONFIG["JOB_ID"]) 15 | LOGGER.info("JOB_RUN_ID %s", JOB_CONFIG["JOB_RUN_ID"]) 16 | 17 | LOGGER.info("WORKFLOW_NAME: %s", JOB_CONFIG["WORKFLOW_NAME"]) 18 | LOGGER.info("WORKFLOW_RUN_ID %s", JOB_CONFIG["WORKFLOW_RUN_ID"]) 19 | data_src = f"s3://{JOB_CONFIG['S3_BUCKET']}/{JOB_CONFIG['s3_raw_prefix']}/cereal.csv" 20 | LOGGER.info("Reading raw data from %s", data_src) 21 | df = pd.read_csv(data_src, sep=";") 22 | LOGGER.info("DF shape %s", df.shape) 23 | write_parquet(df, f"s3://{JOB_CONFIG['S3_BUCKET']}/{JOB_CONFIG['s3_refined_prefix']}") 24 | 25 | 26 | if __name__ == "__main__": 27 | from config import JOB_CONFIG 28 | 29 | main() 30 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/refined_to_curated.py: -------------------------------------------------------------------------------- 1 | """ 2 | AWS GLUE PySpark Job to process REFINED data. 3 | From Refined zone to Curated zone. 4 | """ 5 | import datetime 6 | import logging 7 | 8 | import pyspark 9 | from glue_shared import get_spark_session_and_glue_job 10 | from glue_shared.spark_helpers import read_parquet 11 | 12 | LOGGER = logging.getLogger("job") 13 | 14 | 15 | def run_etl(cfg, spark: pyspark.sql.SQLContext): 16 | df = read_parquet(spark, f"s3://{cfg['S3_BUCKET']}/{cfg['s3_prefix']}") 17 | LOGGER.debug("Count in: %s", df.count()) 18 | LOGGER.debug("Here we can continue processing data and write them to the curated zone.") 19 | 20 | 21 | def main(): 22 | spark, job = get_spark_session_and_glue_job(JOB_CONFIG) 23 | LOGGER.debug("Spark job started at: %s", datetime.datetime.utcnow().isoformat()) 24 | 25 | run_etl(JOB_CONFIG, spark) 26 | 27 | LOGGER.debug("Spark job finished at: %s", datetime.datetime.utcnow().isoformat()) 28 | 29 | 30 | if __name__ == "__main__": 31 | from config import JOB_CONFIG 32 | 33 | main() 34 | -------------------------------------------------------------------------------- /terraform/solution/s3.tf: -------------------------------------------------------------------------------- 1 | ##################### S3 Buckets ####################### 2 | module "s3_bucket_all" { 3 | source = "../modules/s3-bucket" 4 | bucket_name = var.glue_bucket_name 5 | versioning_enabled = false 6 | tags = var.tags 7 | 8 | } 9 | 10 | resource "aws_s3_bucket_object" "ds1_raw_folder" { 11 | key = "/ds1/raw/" 12 | bucket = module.s3_bucket_all.id 13 | server_side_encryption = "AES256" 14 | } 15 | 16 | resource "aws_s3_bucket_object" "raw_data_file" { 17 | bucket = module.s3_bucket_all.id 18 | key = "/ds1/raw/cereal.csv" 19 | source = "../../dummy_data/cereal.csv" 20 | } 21 | 22 | resource "aws_s3_bucket_object" "ds1_refined_folder" { 23 | key = "/ds1/refined/yes" 24 | bucket = module.s3_bucket_all.id 25 | server_side_encryption = "AES256" 26 | } 27 | 28 | resource "aws_s3_bucket_object" "code_folder" { 29 | key = "/code/" 30 | bucket = module.s3_bucket_all.id 31 | server_side_encryption = "AES256" 32 | } 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | TF_DIR=$(BASEDIR)/terraform/solution 3 | 4 | # Check if deploy environment is set! 
5 | variables := TF_STATE_BUCKET 6 | 7 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 8 | $(foreach 1,$(variables),$(fatal_if_undefined)) 9 | 10 | 11 | .PHONY: help 12 | help: ## This help. 13 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 14 | 15 | tf-apply: ## terraform apply 16 | cd $(TF_DIR) && terraform apply -auto-approve 17 | 18 | tf-init: ## terraform init 19 | cd $(TF_DIR) && terraform init -backend-config "bucket=${TF_STATE_BUCKET}" -backend-config "key=tf.state" 20 | 21 | tf-plan: ## terraform plan 22 | cd $(TF_DIR) && terraform plan 23 | 24 | tf-destroy: ## terraform destroy 25 | cd $(TF_DIR) && terraform destroy -force 26 | 27 | jobs-deploy: ## deploy glue jobs 28 | bash glue-jobs.sh deploy 29 | 30 | jobs-package: ## package glue jobs 31 | bash glue-jobs.sh package 32 | 33 | jobs-clean: ## clean glue jobs 34 | bash glue-jobs.sh clean 35 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help 2 | help: ## This help. 3 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 6 | 7 | clean-build: ## Remove build artifacts 8 | rm -fr build/ 9 | rm -fr dist/ 10 | rm -fr .eggs/ 11 | find . -name '*.egg-info' -exec rm -fr {} + 12 | find . -name '*.egg' -exec rm -f {} + 13 | 14 | clean-pyc: ## Remove Python file artifacts 15 | find . -name '*.pyc' -exec rm -f {} + 16 | find . -name '*.pyo' -exec rm -f {} + 17 | find . -name '*~' -exec rm -f {} + 18 | find . -name '__pycache__' -exec rm -fr {} + 19 | 20 | clean-test: ## Remove test and coverage artifacts 21 | rm -fr .tox/ 22 | rm -f .coverage 23 | rm -fr htmlcov/ 24 | rm -fr .pytest_cache 25 | 26 | install: clean ## Install in a current environment 27 | python setup.py install 28 | 29 | install-dev: ## Install in development mode 30 | python setup.py develop 31 | 32 | wheel: clean ## Build wheel 33 | python setup.py bdist_wheel 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jan Gazda 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /terraform/modules/iam-policy/README.md: -------------------------------------------------------------------------------- 1 | # Generated docs 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | attachment\_name | The name of the attachment. | `string` | n/a | yes | 19 | | iam\_role\_policy | The policy document. This is a JSON formatted string. For more information about building AWS IAM policy documents with Terraform, see the AWS IAM Policy Document Guide. | `string` | n/a | yes | 20 | | iam\_role\_policy\_name | The name of the policy. If omitted, Terraform will assign a random, unique name. | `string` | n/a | yes | 21 | | roles | The role(s) the policy should be applied to. | `list(any)` | n/a | yes | 22 | 23 | ## Outputs 24 | 25 | No output. 26 | 27 | 28 | -------------------------------------------------------------------------------- /terraform/solution/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "3.22.0" 6 | constraints = "~> 3.22.0" 7 | hashes = [ 8 | "h1:8aWXjFcmEi64P0TMHOCQXWws+/SmvJQrNvHlzdktKOM=", 9 | "h1:f/Tz8zv1Zb78ZaiyJkQ0MGIViZwbYrLuQk3kojPM91c=", 10 | "zh:4a9a66caf1964cdd3b61fb3ebb0da417195a5529cb8e496f266b0778335d11c8", 11 | "zh:514f2f006ae68db715d86781673faf9483292deab235c7402ff306e0e92ea11a", 12 | "zh:5277b61109fddb9011728f6650ef01a639a0590aeffe34ed7de7ba10d0c31803", 13 | "zh:67784dc8c8375ab37103eea1258c3334ee92be6de033c2b37e3a2a65d0005142", 14 | "zh:76d4c8be2ca4a3294fb51fb58de1fe03361d3bc403820270cc8e71a04c5fa806", 15 | "zh:8f90b1cfdcf6e8fb1a9d0382ecaa5056a3a84c94e313fbf9e92c89de271cdede", 16 | "zh:d0ac346519d0df124df89be2d803eb53f373434890f6ee3fb37112802f9eac59", 17 | "zh:d6256feedada82cbfb3b1dd6dd9ad02048f23120ab50e6146a541cb11a108cc1", 18 | "zh:db2fe0d2e77c02e9a74e1ed694aa352295a50283f9a1cf896e5be252af14e9f4", 19 | "zh:eda61e889b579bd90046939a5b40cf5dc9031fb5a819fc3e4667a78bd432bdb2", 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /glue-jobs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | # This script orchestrates the packaging and uploading of all glue jobs. 4 | # The idea is to run different Make targets to get the desired effect.
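# Usage: bash glue-jobs.sh [package|test|deploy|clean]
# (the root Makefile wraps this as jobs-package, jobs-deploy and jobs-clean)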
5 | # Each Glue Job should comply with the structure and provide the 6 | # necessary Make targets: 7 | # package - create a deployable package 8 | # test - run tests 9 | # deploy - make the package and upload it to S3 10 | # In the end, the S3 content will be glue_job.py and glue_job_deps.zip. 11 | 12 | ARGUMENT=$1 13 | OPTIONS="package|test|deploy|clean" 14 | 15 | if [[ -z "$ARGUMENT" || ${OPTIONS} != *"$ARGUMENT"* ]]; then 16 | echo "Argument must match one of ${OPTIONS}" 17 | exit 1 18 | fi 19 | 20 | echo GLUEING GLUE 21 | # Deploy data source specific jobs 22 | for DIR in glue/data_sources/*/*; do 23 | if [[ -d ${DIR} ]]; then 24 | if [[ -e ${DIR}/Makefile ]]; then 25 | cd ${DIR} 26 | echo --------${DIR}------------- 27 | make ${ARGUMENT} 28 | echo --------------------------- 29 | cd - 30 | fi 31 | fi; 32 | done 33 | 34 | # Deploy general jobs 35 | for DIR in glue/shared/glue_jobs/*; do 36 | if [[ -d ${DIR} ]]; then 37 | if [[ -e ${DIR}/Makefile ]]; then 38 | cd ${DIR} 39 | echo --------${DIR}------------- 40 | make ${ARGUMENT} 41 | echo --------------------------- 42 | cd - 43 | fi 44 | fi; 45 | done 46 | -------------------------------------------------------------------------------- /terraform/solution/iam.tf: -------------------------------------------------------------------------------- 1 | ###################### Glue IAM ######################################## 2 | 3 | module "glue_role" { 4 | source = "../modules/iam-role" 5 | iam_role_name = "glue-role" 6 | assume_role_policy = data.aws_iam_policy_document.glue_assume_role_policy.json 7 | tags = var.tags 8 | } 9 | 10 | data "aws_iam_policy_document" "glue_assume_role_policy" { 11 | statement { 12 | actions = ["sts:AssumeRole"] 13 | 14 | principals { 15 | type = "Service" 16 | identifiers = ["glue.amazonaws.com"] 17 | } 18 | } 19 | } 20 | 21 | ################# Attach AWS Managed Policies ################## 22 | 23 | resource "aws_iam_policy_attachment" "glue_service_role" { 24 | name = "AWSGlueServiceRole" 25 | roles = [module.glue_role.iam_role_id] 26 | policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole" 27 | } 28 | 29 | resource "aws_iam_policy_attachment" "s3_full_access" { 30 | name = "AmazonS3FullAccess" 31 | roles = [module.glue_role.iam_role_id] 32 | policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess" 33 | } 34 | 35 | resource "aws_iam_policy_attachment" "cloudwatch_logs_role" { 36 | name = "CloudWatchLogsFullAccess" 37 | roles = [module.glue_role.iam_role_id] 38 | policy_arn = "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess" 39 | } 40 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/README.md: -------------------------------------------------------------------------------- 1 | # Glue PySpark job 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | connections | The list of connections used for this job. | `list(string)` | `[]` | no | 19 | | default\_arguments | The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide.
For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide. | `map(string)` | `{}` | no | 20 | | description | Description of the job. | `string` | `""` | no | 21 | | max\_concurrent\_runs | The maximum number of concurrent runs allowed for a job. The default is 1. | `string` | `"1"` | no | 22 | | max\_retries | Number of retries | `string` | `null` | no | 23 | | name | Name of the job | `string` | n/a | yes | 24 | | number\_of\_workers | Number of Glue (G.#X) workers | `number` | `null` | no | 25 | | role\_arn | The ARN of the IAM role associated with this job. | `string` | n/a | yes | 26 | | script\_location | Specifies the S3 path to a script that executes a job. | `string` | n/a | yes | 27 | | tags | AWS resource tags | `map(string)` | `{}` | no | 28 | | worker\_type | Worker type | `string` | `"G.1X"` | no | 29 | 30 | ## Outputs 31 | 32 | | Name | Description | 33 | |------|-------------| 34 | | job\_arn | AWS Glue Job ARN | 35 | | job\_name | AWS Glue Job Name | 36 | 37 | 38 | -------------------------------------------------------------------------------- /terraform/solution/glue_workflow_complex.tf: -------------------------------------------------------------------------------- 1 | module "glue_workflow_complex" { 2 | source = "../modules/glue-workflow" 3 | workflow_name = "etl-workflow-complex" 4 | security_name = "glueSecurityConfigComplex" 5 | } 6 | 7 | ###################### Glue Triggers and DAG ######################################## 8 | 9 | locals { 10 | jobs_0_1 = [ 11 | module.dummy_job["dummy_job_0"].job_name, 12 | module.dummy_job["dummy_job_1"].job_name 13 | ] 14 | job_2 = module.dummy_job["dummy_job_2"].job_name 15 | jobs_3_5 = [ 16 | module.dummy_job["dummy_job_3"].job_name, 17 | module.dummy_job["dummy_job_4"].job_name, 18 | module.dummy_job["dummy_job_5"].job_name 19 | ] 20 | } 21 | 22 | # starts 2 jobs 23 | resource "aws_glue_trigger" "start_complex" { 24 | name = "start_complex" 25 | type = "ON_DEMAND" 26 | workflow_name = module.glue_workflow_complex.workflow_name 27 | 28 | dynamic "actions" { 29 | for_each = local.jobs_0_1 30 | content { 31 | job_name = actions.value 32 | } 33 | } 34 | } 35 | 36 | # waits for first 2 jobs to finish 37 | resource "aws_glue_trigger" "complex_stage_2" { 38 | name = "complex_stage_2" 39 | type = "CONDITIONAL" 40 | workflow_name = module.glue_workflow_complex.workflow_name 41 | 42 | actions { 43 | job_name = local.job_2 44 | } 45 | predicate { 46 | dynamic "conditions" { 47 | for_each = local.jobs_0_1 48 | content { 49 | job_name = conditions.value 50 | state = "SUCCEEDED" 51 | } 52 | } 53 | } 54 | } 55 | 56 | resource "aws_glue_trigger" "wait_for_second" { 57 | name = "complex_stage_3" 58 | type = "CONDITIONAL" 59 | workflow_name = module.glue_workflow_complex.workflow_name 60 | 61 | dynamic "actions" { 62 | for_each = local.jobs_3_5 63 | content { 64 | job_name = actions.value 65 | } 66 | } 67 | 68 | predicate { 69 | conditions { 70 | job_name = local.job_2 71 | state = "SUCCEEDED" 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/helpers.py: -------------------------------------------------------------------------------- 1 | """Helper functions used across this library.""" 2 | import os 3 | import re 4 | from functools import partial 5 | from itertools import islice 6 | from typing import Tuple 7 | 8 | EXTENSIONS = 
re.compile(r".+py$|.+zip$|.+egg$") 9 | 10 | 11 | def take(n, iterable): 12 | """ 13 | Return first n items of the iterable as a list 14 | 15 | Notes 16 | ----- 17 | From itertools recipes: 18 | https://docs.python.org/3.6/library/itertools.html#itertools-recipes 19 | """ 20 | 21 | return list(islice(iterable, n)) 22 | 23 | 24 | def chunked(iterable, n): 25 | """Break *iterable* into lists of length *n*: 26 | 27 | >>> list(chunked([1, 2, 3, 4, 5, 6], 3)) 28 | [[1, 2, 3], [4, 5, 6]] 29 | 30 | If the length of *iterable* is not evenly divisible by *n*, the last 31 | returned list will be shorter: 32 | 33 | >>> list(chunked([1, 2, 3, 4, 5, 6, 7, 8], 3)) 34 | [[1, 2, 3], [4, 5, 6], [7, 8]] 35 | 36 | To use a fill-in value instead, see the :func:`grouper` recipe. 37 | 38 | :func:`chunked` is useful for splitting up a computation on a large number 39 | of keys into batches, to be pickled and sent off to worker processes. One 40 | example is operations on rows in MySQL, which does not implement 41 | server-side cursors properly and would otherwise load the entire dataset 42 | into RAM on the client. 43 | 44 | Notes 45 | ----- 46 | Reimplemented from more itertools to avoid the installation of the package. 47 | https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.chunked 48 | """ 49 | return iter(partial(take, n, iter(iterable)), []) 50 | 51 | 52 | def get_py_zip_egg_files(path: str) -> Tuple[str, ...]: 53 | """ 54 | Find all .py, .zip, .egg files in sys.path. 55 | 56 | This method is a workaround needed for Glue2.0 as of 2020-05-11 57 | """ 58 | 59 | return tuple(e.path for e in filter(lambda ent: EXTENSIONS.match(ent.name), os.scandir(path))) 60 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/README.md: -------------------------------------------------------------------------------- 1 | # Glue Python Shell job 2 | 3 | 4 | ## Requirements 5 | 6 | No requirements. 7 | 8 | ## Providers 9 | 10 | | Name | Version | 11 | |------|---------| 12 | | aws | n/a | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | connections | The list of connections used for this job. | `list(string)` | `[]` | no | 19 | | default\_arguments | The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide. For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide. | `map(string)` | `{}` | no | 20 | | description | Description of the job. | `string` | `""` | no | 21 | | max\_capacity | The maximum number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. Required when pythonshell is set, accept either 0.0625 or 1.0. | `string` | `"0.0625"` | no | 22 | | max\_concurrent\_runs | The maximum number of concurrent runs allowed for a job. The default is 1. | `string` | `"1"` | no | 23 | | max\_retries | Number of retries | `string` | `null` | no | 24 | | name | Name of the job | `string` | n/a | yes | 25 | | python\_version | The Python version being used to execute a Python shell job. Allowed values are 2 or 3. 
| `string` | `"3"` | no | 26 | | role\_arn | The ARN of the IAM role associated with this job. | `string` | n/a | yes | 27 | | script\_location | Specifies the S3 path to a script that executes a job. | `string` | n/a | yes | 28 | | tags | AWS resource tags | `map(string)` | `{}` | no | 29 | 30 | ## Outputs 31 | 32 | | Name | Description | 33 | |------|-------------| 34 | | job\_arn | AWS Glue Job ARN | 35 | | job\_name | AWS Glue Job Name | 36 | 37 | 38 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/raw_to_refined/Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | MK_PATH:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) 3 | MK_PARENT:=$(realpath $(MK_PATH)../) 4 | JOB_NAME:=$(notdir $(MK_PARENT)) 5 | TRANSITION_STATE:=$(notdir $(BASEDIR)) 6 | TRANSITION_STATE_PY=$(TRANSITION_STATE).py 7 | JOB_TRANSITION_ZIP=$(BASEDIR)/$(TRANSITION_STATE).zip 8 | BUILD_DIR=$(BASEDIR)/dist 9 | 10 | variables := TF_VAR_glue_bucket_name 11 | 12 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 13 | $(foreach 1,$(variables),$(fatal_if_undefined)) 14 | 15 | .PHONY: help 16 | help: ## This help. 17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 18 | 19 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 20 | 21 | clean-build: ## Remove build artifacts 22 | rm -fr build/ 23 | rm -fr dist/ 24 | rm -fr .eggs/ 25 | find . -name '*.egg-info' -exec rm -fr {} + 26 | find . -name '*.egg' -exec rm -f {} + 27 | 28 | clean-pyc: ## Remove Python file artifacts 29 | find . -name '*.pyc' -exec rm -f {} + 30 | find . -name '*.pyo' -exec rm -f {} + 31 | find . -name '*~' -exec rm -f {} + 32 | find . 
-name '__pycache__' -exec rm -fr {} + 33 | 34 | clean-test: ## Remove test and coverage artifacts 35 | rm -fr .tox/ 36 | rm -f .coverage 37 | rm -fr htmlcov/ 38 | rm -fr .pytest_cache 39 | 40 | test: ## Run tests 41 | echo Tests are not implemented 42 | 43 | package: ## Build deps package 44 | @echo Packaging 45 | @mkdir -p $(BUILD_DIR) 46 | @pip install wheel 47 | cp config.py $(BUILD_DIR) 48 | pip wheel -w $(BUILD_DIR) -r requirements.txt --no-deps 49 | 50 | upload-job: ## Upload job.py file 51 | @echo Uploading $(JOB_NAME) 52 | aws s3 cp $(TRANSITION_STATE_PY) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/$(TRANSITION_STATE_PY) 53 | 54 | upload: upload-job ## Upload artefacts to S3 55 | aws s3 sync --delete $(BUILD_DIR) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/dependencies 56 | 57 | deploy: clean package upload ## Package and upload to S3 58 | -------------------------------------------------------------------------------- /glue/data_sources/ds1/refined_to_curated/Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | MK_PATH:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) 3 | MK_PARENT:=$(realpath $(MK_PATH)../) 4 | JOB_NAME:=$(notdir $(MK_PARENT)) 5 | TRANSITION_STATE:=$(notdir $(BASEDIR)) 6 | TRANSITION_STATE_PY=$(TRANSITION_STATE).py 7 | BUILD_DIR=$(BASEDIR)/dist 8 | JOB_TRANSITION_ZIP=$(BUILD_DIR)/$(TRANSITION_STATE).zip 9 | 10 | variables := TF_VAR_glue_bucket_name 11 | 12 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 13 | $(foreach 1,$(variables),$(fatal_if_undefined)) 14 | 15 | .PHONY: help 16 | help: ## This help. 17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 18 | 19 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 20 | 21 | clean-build: ## Remove build artifacts 22 | rm -fr build/ 23 | rm -fr dist/ 24 | rm -fr .eggs/ 25 | find . -name '*.egg-info' -exec rm -fr {} + 26 | find . -name '*.egg' -exec rm -f {} + 27 | 28 | clean-pyc: ## Remove Python file artifacts 29 | find . -name '*.pyc' -exec rm -f {} + 30 | find . -name '*.pyo' -exec rm -f {} + 31 | find . -name '*~' -exec rm -f {} + 32 | find . -name '__pycache__' -exec rm -fr {} + 33 | 34 | clean-test: ## Remove test and coverage artifacts 35 | rm -fr .tox/ 36 | rm -f .coverage 37 | rm -fr htmlcov/ 38 | rm -fr .pytest_cache 39 | 40 | test: ## Run tests 41 | echo Tests are not implemented 42 | 43 | package: ## Build deps package 44 | mkdir -p $(BUILD_DIR) 45 | cp config.py $(BUILD_DIR) 46 | pip install -t $(BUILD_DIR) -r requirements.txt 47 | cd $(BUILD_DIR) && zip -r $(JOB_TRANSITION_ZIP) . 
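# Note: this PySpark job ships its dependencies as a single zip which the
# Terraform job definition passes via --extra-py-files, whereas the Python
# Shell job (../raw_to_refined) syncs wheel files to S3 instead.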
48 | 49 | 50 | upload-job: ## Upload job.py file 51 | @echo Uploading $(JOB_NAME) 52 | aws s3 cp $(TRANSITION_STATE_PY) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/$(TRANSITION_STATE_PY) 53 | 54 | upload: upload-job ## Upload artefacts to S3 55 | aws s3 cp $(JOB_TRANSITION_ZIP) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/dependencies/ 56 | 57 | deploy: clean package upload ## Package and upload to S3 58 | -------------------------------------------------------------------------------- /glue/data_sources/dummy_job/dummy_transition/Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR=$(CURDIR) 2 | MK_PATH:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) 3 | MK_PARENT:=$(realpath $(MK_PATH)../) 4 | JOB_NAME:=$(notdir $(MK_PARENT)) 5 | TRANSITION_STATE:=$(notdir $(BASEDIR)) 6 | TRANSITION_STATE_PY=$(TRANSITION_STATE).py 7 | JOB_TRANSITION_ZIP=$(BASEDIR)/$(TRANSITION_STATE).zip 8 | BUILD_DIR=$(BASEDIR)/dist 9 | 10 | variables := TF_VAR_glue_bucket_name 11 | 12 | fatal_if_undefined = $(if $(findstring undefined,$(origin $1)),$(error Error: variable [$1] is undefined)) 13 | $(foreach 1,$(variables),$(fatal_if_undefined)) 14 | 15 | .PHONY: help 16 | help: ## This help. 17 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 18 | 19 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 20 | 21 | clean-build: ## Remove build artifacts 22 | rm -fr build/ 23 | rm -fr dist/ 24 | rm -fr .eggs/ 25 | find . -name '*.egg-info' -exec rm -fr {} + 26 | find . -name '*.egg' -exec rm -f {} + 27 | 28 | clean-pyc: ## Remove Python file artifacts 29 | find . -name '*.pyc' -exec rm -f {} + 30 | find . -name '*.pyo' -exec rm -f {} + 31 | find . -name '*~' -exec rm -f {} + 32 | find . -name '__pycache__' -exec rm -fr {} + 33 | 34 | clean-test: ## Remove test and coverage artifacts 35 | rm -fr .tox/ 36 | rm -f .coverage 37 | rm -fr htmlcov/ 38 | rm -fr .pytest_cache 39 | 40 | test: ## Run tests 41 | echo Tests are not implemented 42 | 43 | package: ## Build deps package 44 | @echo Packaging 45 | @mkdir -p $(BUILD_DIR) 46 | @pip install wheel 47 | cp config.py $(BUILD_DIR) 48 | pip wheel -w $(BUILD_DIR) -r requirements.txt --no-deps 49 | 50 | upload-job: ## Upload job.py file 51 | @echo Uploading $(JOB_NAME) 52 | aws s3 cp $(TRANSITION_STATE_PY) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/$(TRANSITION_STATE_PY) 53 | 54 | #upload: upload-job ## Upload artefacts to S3 55 | # aws s3 sync --delete $(BUILD_DIR) s3://$(TF_VAR_glue_bucket_name)/code/$(JOB_NAME)/$(TRANSITION_STATE)/dependencies 56 | 57 | deploy: clean upload-job ## Package and upload to S3 58 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/pandas_helpers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | import pandas as pd 5 | from glue_shared.boto3_helpers import get_s3_keys 6 | 7 | LOGGER = logging.getLogger(__name__) 8 | 9 | 10 | def write_parquet( 11 | df: pd.DataFrame, 12 | s3_folder_url: str, 13 | partition_cols: List[str] = None, 14 | compression: str = None, 15 | ): 16 | """ 17 | Write Parquet file to S3 folder. 18 | 19 | Parameters 20 | ---------- 21 | df 22 | Pandas dataframe 23 | s3_folder_url 24 | S3 url: s3://<bucket>/<folder>.
25 | partition_cols 26 | Partition path by columns 27 | compression 28 | Parquet compression. Default is "snappy" 29 | 30 | """ 31 | 32 | import pyarrow as pa 33 | import pyarrow.parquet as pq 34 | import s3fs 35 | 36 | LOGGER.info("Writing parquet file to S3: %s", f"{s3_folder_url}") 37 | table = pa.Table.from_pandas(df, preserve_index=False) 38 | 39 | pq.write_to_dataset( 40 | table, 41 | s3_folder_url, 42 | filesystem=s3fs.S3FileSystem(), 43 | partition_cols=partition_cols, 44 | compression=compression or "snappy", 45 | ) 46 | 47 | 48 | def df_from_s3_json( 49 | s3_client, 50 | bucket_name: str, 51 | prefix: str, 52 | compression: str = None, 53 | lines: bool = True, 54 | ): 55 | """ 56 | Create Pandas DataFrame from multiple files in S3 prefix. 57 | 58 | Parameters 59 | ---------- 60 | s3_client 61 | boto3.client('s3') 62 | bucket_name 63 | prefix 64 | compression 65 | Json file compression. 66 | lines 67 | Multiple JSON objects per line. 68 | 69 | Returns 70 | ------- 71 | pd.DataFrame 72 | Dataframe containing data under S3 prefix. 73 | 74 | """ 75 | 76 | df_merged = pd.DataFrame() 77 | 78 | for key in get_s3_keys(s3_client, bucket_name, prefix): 79 | resp = s3_client.get_object(Bucket=bucket_name, Key=key) 80 | df = pd.read_json(resp["Body"], orient="records", lines=lines, compression=compression) 81 | df_merged = df_merged.append(df, ignore_index=True) 82 | 83 | return df_merged 84 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/2.0/variables.tf: -------------------------------------------------------------------------------- 1 | variable "script_location" { 2 | type = string 3 | description = "Specifies the S3 path to a script that executes a job." 4 | } 5 | 6 | 7 | variable "connections" { 8 | type = list(string) 9 | description = "The list of connections used for this job." 10 | default = [] 11 | } 12 | 13 | variable "default_arguments" { 14 | type = map(string) 15 | description = "The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide. For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide." 16 | default = {} 17 | } 18 | 19 | variable "description" { 20 | type = string 21 | description = "Description of the job." 22 | default = "" 23 | } 24 | 25 | variable "max_concurrent_runs" { 26 | type = string 27 | description = "The maximum number of concurrent runs allowed for a job. The default is 1." 28 | default = "1" 29 | } 30 | 31 | 32 | variable "name" { 33 | type = string 34 | description = "Name of the job" 35 | } 36 | 37 | variable "role_arn" { 38 | type = string 39 | description = "The ARN of the IAM role associated with this job." 40 | 41 | } 42 | 43 | variable "tags" { 44 | type = map(string) 45 | description = "AWS resource tags" 46 | default = {} 47 | } 48 | 49 | variable "number_of_workers" { 50 | type = number 51 | description = "Number of Glue (G.#X) workers" 52 | default = null 53 | } 54 | 55 | variable "worker_type" { 56 | description = "Worker type" 57 | type = string 58 | default = "G.1X" 59 | validation { 60 | condition = contains(["G.1X", "G.2X"], var.worker_type) 61 | error_message = "Worker type can be one of 'G.1X', 'G.2X'." 
62 | } 63 | } 64 | 65 | variable "max_retries" { 66 | description = "Number of retries" 67 | type = string 68 | default = null 69 | } 70 | -------------------------------------------------------------------------------- /terraform/solution/glue_jobs.tf: -------------------------------------------------------------------------------- 1 | ###################### Glue Jobs ######################################## 2 | 3 | module "ds1_raw_to_refined_job" { 4 | source = "../modules/glue-job/python_shell" 5 | 6 | name = "ds1-raw-to-refined" 7 | role_arn = module.glue_role.iam_role_arn 8 | script_location = "s3://${var.glue_bucket_name}/code/ds1/raw_to_refined/raw_to_refined.py" 9 | default_arguments = { 10 | "--extra-py-files" = <<EOT ... -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/argument_handlers.py: -------------------------------------------------------------------------------- ... 18 | def parse_args_fallback(arguments: Sequence, options: List[str] = None) -> Dict: 19 | """ 20 | Argument parser fallback for AWS Glue jobs. 21 | 22 | This fallback function is necessary due to the lack of API uniformity 23 | between PySpark and Python shell jobs. 24 | 25 | Parameters 26 | ---------- 27 | arguments 28 | Sequence of options and values to be parsed (sys.argv). 29 | options 30 | Options whose values are resolved. 31 | 32 | Returns 33 | ------- 34 | Parsed options and values. 35 | 36 | """ 37 | LOGGER.debug("Parsing arguments with fallback function.") 38 | LOGGER.debug("Parsing arguments: %s options: %s", arguments, options) 39 | parser = CustomArgumentParser() 40 | if not options: 41 | options = [] 42 | for opt in options: 43 | parser.add_argument(f"--{opt}", required=True) 44 | 45 | args = vars(parser.parse_known_args(arguments[1:])[0]) 46 | return args 47 | 48 | 49 | def parse_args(arguments: Sequence, options: List[str] = None) -> Dict: 50 | """ 51 | Parse input arguments. 52 | 53 | Simple check for whether the awsglue module is available; it is not in Python shell jobs. 54 | 55 | Parameters 56 | ---------- 57 | arguments 58 | Sequence of options and values to be parsed (sys.argv). 59 | options 60 | Options whose values are resolved. 61 | 62 | Returns 63 | ------- 64 | Parsed options and values.
65 | 66 | """ 67 | LOGGER.debug("Parsing arguments: %s options: %s", arguments, options) 68 | 69 | try: 70 | import awsglue.utils as au 71 | except ImportError: 72 | return parse_args_fallback(arguments, options) 73 | 74 | try: 75 | resolved = au.getResolvedOptions(args=arguments, options=options) 76 | LOGGER.debug("awsglue.utils args resolved: %s", resolved) 77 | return resolved 78 | except au.GlueArgumentError: 79 | return parse_args_fallback(arguments, options) 80 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from types import ModuleType 3 | import datetime 4 | from glue_shared import parse_args 5 | 6 | 7 | def test_parse_args_pyshell(): 8 | argv = [ 9 | "/tmp/glue-python-scripts-jm72zh6c/jan_pyshell_job.py", 10 | "--APP_SETTINGS_ENVIRONMENT", 11 | "dev", 12 | "--job-bookmark-option", 13 | "job-bookmark-disable", 14 | "--job-language", 15 | "python", 16 | ] 17 | 18 | actual = parse_args(argv, ["APP_SETTINGS_ENVIRONMENT"]) 19 | expected = {"APP_SETTINGS_ENVIRONMENT": "dev"} 20 | assert actual == expected 21 | 22 | 23 | def test_parse_args_glueetl(): 24 | argv = [ 25 | "script_2020-04-15-11-21-57.py", 26 | "--JOB_NAME", 27 | "glue-spark-job", 28 | "--APP_SETTINGS_ENVIRONMENT", 29 | "dev", 30 | "--JOB_ID", 31 | "j_3456789", 32 | "--JOB_RUN_ID", 33 | "jr_3456789", 34 | "--job-bookmark-option", 35 | "job-bookmark-disable", 36 | "--TempDir", 37 | "s3://bucker/Key/dir", 38 | ] 39 | 40 | sys.modules["dynamicframe"] = ModuleType("DynamicFrame") 41 | sys.modules["dynamicframe"].DynamicFrame = None 42 | sys.modules["awsglue.utils"] = ModuleType("awsglue.utils") 43 | sys.modules["awsglue.utils"].getResolvedOptions = lambda arguments, options=None: { 44 | "APP_SETTINGS_ENVIRONMENT": "dev", 45 | "JOB_ID": "j_3456789", 46 | } 47 | 48 | actual = parse_args(argv, ["APP_SETTINGS_ENVIRONMENT", "JOB_ID"]) 49 | expected = {"APP_SETTINGS_ENVIRONMENT": "dev", "JOB_ID": "j_3456789"} 50 | assert actual == expected 51 | 52 | 53 | def test_comma_str_time_2_time_obj(): 54 | from glue_shared.str2obj import comma_str_time_2_time_obj 55 | 56 | input1 = "2020-04-21 03:00" 57 | input2 = "2020-04-21 03:00, 2020-04-21 04:00" 58 | input3 = "2020-04-21 03:00,2020-04-20 02:00" 59 | 60 | expected1 = (datetime.datetime(2020, 4, 21, 3, 0),) 61 | expected2 = ( 62 | datetime.datetime(2020, 4, 21, 3, 0), 63 | datetime.datetime(2020, 4, 21, 4, 0), 64 | ) 65 | 66 | expected3 = ( 67 | datetime.datetime(2020, 4, 21, 3, 0), 68 | datetime.datetime(2020, 4, 20, 2, 0), 69 | ) 70 | assert comma_str_time_2_time_obj(input1) == expected1 71 | assert comma_str_time_2_time_obj(input2) == expected2 72 | assert comma_str_time_2_time_obj(input3) == expected3 73 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/defaults.py: -------------------------------------------------------------------------------- 1 | """This module intends to provide base configuration for AWS Glue jobs.""" 2 | import logging 3 | from typing import Dict 4 | 5 | 6 | class InfoDebugFilter(logging.Filter): 7 | def filter(self, rec): 8 | """Filter debug and info messages.""" 9 | return rec.levelno in (logging.DEBUG, logging.INFO) 10 | 11 | 12 | def default_logging_config(level: str = "INFO", formatter_name: str = "detailed") -> Dict: 13 | """ 14 | Create default logging config. 
15 | 16 | Parameters 17 | ---------- 18 | level 19 | Log level. 20 | formatter_name 21 | Log formatter name. Possible values: detailed or dev. 22 | 23 | Returns 24 | ------- 25 | Dictionary compatible with logging.config.dictConfig. 26 | 27 | """ 28 | logging_config = { 29 | "version": 1, 30 | "filters": {"info_debug_filter": {"()": InfoDebugFilter}}, 31 | "formatters": { 32 | "detailed": { 33 | "class": "logging.Formatter", 34 | "format": "%(asctime)s %(levelname)-8s %(name)-15s - %(message)s", 35 | }, 36 | "dev": { 37 | "class": "logging.Formatter", 38 | "format": "%(asctime)s %(levelname)s %(name)s - ++++++++ %(message)s ++++++++", 39 | }, 40 | }, 41 | "handlers": { 42 | "debug_handler": { 43 | "class": "logging.StreamHandler", 44 | "formatter": formatter_name, 45 | "level": "DEBUG", 46 | "filters": ["info_debug_filter"], 47 | "stream": "ext://sys.stdout", 48 | }, 49 | "warning": { 50 | "class": "logging.StreamHandler", 51 | "formatter": formatter_name, 52 | "level": "WARNING", 53 | "stream": "ext://sys.stdout", 54 | }, 55 | }, 56 | "loggers": { 57 | "job": { 58 | "level": level, 59 | "propagate": False, 60 | "handlers": ["debug_handler", "warning"], 61 | }, 62 | "glue_shared": { 63 | "level": level, 64 | "propagate": False, 65 | "handlers": ["debug_handler", "warning"], 66 | }, 67 | }, 68 | "root": {"level": "WARNING", "handlers": ["debug_handler", "warning"]}, 69 | } 70 | 71 | return logging_config 72 | -------------------------------------------------------------------------------- /terraform/modules/glue-job/python_shell/variables.tf: -------------------------------------------------------------------------------- 1 | variable "script_location" { 2 | type = string 3 | description = "Specifies the S3 path to a script that executes a job." 4 | } 5 | 6 | variable "python_version" { 7 | type = string 8 | description = "The Python version being used to execute a Python shell job. Allowed values are 2 or 3." 9 | default = "3" 10 | validation { 11 | condition = contains(["2", "3"], var.python_version) 12 | error_message = "Python version can be only '2' or '3'." 13 | } 14 | } 15 | 16 | variable "connections" { 17 | type = list(string) 18 | description = "The list of connections used for this job." 19 | default = [] 20 | } 21 | 22 | variable "default_arguments" { 23 | type = map(string) 24 | description = "The map of default arguments for this job. You can specify arguments here that your own job-execution script consumes, as well as arguments that AWS Glue itself consumes. For information about how to specify and consume your own Job arguments, see the Calling AWS Glue APIs in Python topic in the developer guide. For information about the key-value pairs that AWS Glue consumes to set up your job, see the Special Parameters Used by AWS Glue topic in the developer guide." 25 | default = {} 26 | } 27 | 28 | variable "description" { 29 | type = string 30 | description = "Description of the job." 31 | default = "" 32 | } 33 | 34 | variable "max_concurrent_runs" { 35 | type = string 36 | description = "The maximum number of concurrent runs allowed for a job. The default is 1." 37 | default = "1" 38 | } 39 | 40 | variable "name" { 41 | type = string 42 | description = "Name of the job" 43 | } 44 | 45 | variable "role_arn" { 46 | type = string 47 | description = "The ARN of the IAM role associated with this job." 
48 | 49 | } 50 | 51 | variable "tags" { 52 | type = map(string) 53 | description = "AWS resource tags" 54 | default = {} 55 | } 56 | 57 | variable "max_capacity" { 58 | type = string 59 | description = "The maximum number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. Required when pythonshell is set; accepts either 0.0625 or 1.0." 60 | default = "0.0625" 61 | validation { 62 | condition = contains(["0.0625", "1.0"], var.max_capacity) 63 | error_message = "Max capacity for a python job must be a string: '0.0625' or '1.0'." 64 | } 65 | } 66 | 67 | variable "max_retries" { 68 | description = "Number of retries" 69 | type = string 70 | default = null 71 | } 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # pytype static type analyzer 137 | .pytype/ 138 | 139 | # Cython debug symbols 140 | cython_debug/ -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/tests/test_boto3_helpers.py: -------------------------------------------------------------------------------- 1 | import moto 2 | import boto3 3 | import pytest 4 | 5 | 6 | @moto.mock_ssm 7 | def test_resolve_2_valid_parameters(): 8 | from glue_shared.boto3_helpers import resolve_ssm_parameters 9 | 10 | ssm_client = boto3.client("ssm") 11 | ssm_client.put_parameter(Name="/dev/db/host", Value="127.0.0.1", Type="String") 12 | ssm_client.put_parameter(Name="/dev/db/password", Value="magic", Type="SecureString") 13 | 14 | actual = resolve_ssm_parameters( 15 | ssm_client, {"db_host": "/dev/db/host", "db_password": "/dev/db/password"} 16 | ) 17 | 18 | expected = {"db_host": "127.0.0.1", "db_password": "magic"} 19 | 20 | assert actual == expected 21 | 22 | 23 | @moto.mock_ssm 24 | def test_resolve_12_valid_parameters(): 25 | from glue_shared.boto3_helpers import resolve_ssm_parameters 26 | from string import ascii_lowercase 27 | from random import choice 28 | 29 | input_ssm_parameters = { 30 | letter: (f"/{letter}/{letter}", "value") for letter in ascii_lowercase[:12] 31 | } 32 | ssm_client = boto3.client("ssm") 33 | param_types = ["String", "SecureString"] 34 | 35 | for key, (name, value) in input_ssm_parameters.items(): 36 | ssm_client.put_parameter(Name=name, Value=value, Type=choice(param_types)) 37 | 38 | actual = resolve_ssm_parameters( 39 | ssm_client, {key: name for key, (name, value) in input_ssm_parameters.items()} 40 | ) 41 | 42 | expected = {key: value for key, (name, value) in input_ssm_parameters.items()} 43 | assert actual == expected 44 | assert len(actual) == len(expected) 45 | 46 | 47 | @moto.mock_ssm 48 | def test_resolve_12_valid_1_invalid_parameters(): 49 | from glue_shared.boto3_helpers import resolve_ssm_parameters 50 | from glue_shared.exceptions import ParametersNotFound 51 | from string import ascii_lowercase 52 | from random import choice 53 | 54 | input_ssm_parameters = { 55 | letter: (f"/{letter}/{letter}", "value") for letter in ascii_lowercase[:12] 56 | } 57 | ssm_client = boto3.client("ssm") 58 | param_types = ["String", "SecureString"] 59 | 60 | for key, (name, value) in input_ssm_parameters.items(): 61 | ssm_client.put_parameter(Name=name, Value=value, Type=choice(param_types)) 62 | 63 | input_ssm_parameters.update({"does_not_exist": ("/does/not/exist", "not_exists")}) 64 | 65 | with pytest.raises(ParametersNotFound): 66 | # The raising call is the whole assertion; any code after it would be unreachable. 67 | resolve_ssm_parameters( 68 | ssm_client, 69 | {key: name for key, (name, value) in input_ssm_parameters.items()}, 70 | ) 71 | 
-------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/glue_interface.py: -------------------------------------------------------------------------------- 1 | """A collection of functions to interface with AWS Glue.""" 2 | import logging 3 | from typing import List, Dict, Sequence, Iterable 4 | 5 | LOGGER = logging.getLogger(__name__) 6 | 7 | 8 | def get_glue_args(arguments: Sequence, options: List[str] = None) -> Dict: 9 | """ 10 | Parse arguments supplied to the job. 11 | 12 | Parameters 13 | ---------- 14 | arguments 15 | Sequence of options and values to be parsed (sys.argv). 16 | options 17 | Options whose values are resolved. 18 | 19 | Returns 20 | ------- 21 | Parsed options and values. 22 | 23 | """ 24 | LOGGER.debug("Parsing arguments for PySpark job") 25 | from awsglue.utils import getResolvedOptions 26 | 27 | LOGGER.debug("Parsing arguments: %s options: %s", arguments, options) 28 | if not options: 29 | return getResolvedOptions(args=arguments, options=["JOB_NAME"]) 30 | return getResolvedOptions(arguments, options=["JOB_NAME"] + options) 31 | 32 | 33 | def get_spark_session_and_glue_job( 34 | glue_args: Dict, 35 | conf=None, 36 | py_files: Iterable[str] = None, 37 | extra_jars: List[str] = None, 38 | ): 39 | """ 40 | Get spark session and AWS glue job. 41 | 42 | Parameters 43 | ---------- 44 | glue_args 45 | Dictionary of Argument Name: Argument value 46 | extra_jars 47 | Extra Spark dependencies in Maven-coordinate form, applied via ``spark.jars.packages``. 48 | conf : Union[pyspark.SparkConf, Dict[str, str]] 49 | Spark config, either object or dictionary of config options. 50 | py_files 51 | Paths to python files (.py, .zip, .egg) 52 | 53 | Returns 54 | ------- 55 | pyspark.sql.SparkSession, awsglue.job.Job 56 | 57 | """ 58 | from awsglue.context import GlueContext 59 | from awsglue.job import Job 60 | from pyspark import SparkContext, SparkConf 61 | 62 | LOGGER.debug("Creating spark session with parameters") 63 | LOGGER.debug("conf=%s", conf) 64 | LOGGER.debug("py_files=%s", py_files) 65 | LOGGER.debug("extra_jars=%s", extra_jars) 66 | if isinstance(conf, dict): 67 | spark_conf = SparkConf() 68 | spark_conf.setAll(conf.items()) 69 | elif isinstance(conf, SparkConf): 70 | spark_conf = conf 71 | else: 72 | spark_conf = None 73 | 74 | if extra_jars and spark_conf: # only applied when a SparkConf exists 75 | spark_dependencies = ",".join(extra_jars) 76 | spark_conf.set("spark.jars.packages", spark_dependencies) 77 | 78 | sc = SparkContext.getOrCreate(conf=spark_conf) 79 | 80 | if py_files: 81 | LOGGER.debug("Adding PYFILEs: %s", py_files) 82 | for py_file in py_files: 83 | sc.addPyFile(py_file) 84 | 85 | glue_context = GlueContext(sparkContext=sc) 86 | job = Job(glue_context=glue_context) 87 | job.init(glue_args["JOB_NAME"], glue_args) 88 | 89 | return glue_context.spark_session, job 90 | 91 | 92 | def commit_job(job): 93 | """Commit AWS glue job.""" 94 | job.commit() 95 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: check-added-large-files # Prevent giant files from being committed (500kB) 6 | - id: check-ast # Simply check whether the files parse as valid python.
7 | - id: check-byte-order-marker # Forbid files which have a UTF-8 byte-order marker 8 | - id: check-builtin-literals # Require literal syntax when initializing empty or zero Python builtin types. 9 | - id: check-case-conflict # Check for files that would conflict in case-insensitive filesystems 10 | - id: check-docstring-first # Checks a common error of defining a docstring after code. 11 | - id: check-merge-conflict # Check for files that contain merge conflict strings. 12 | - id: check-toml # This hook checks toml files for parseable syntax. 13 | - id: check-yaml # This hook checks yaml files for parseable syntax. 14 | args: 15 | - --unsafe 16 | - id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source. 17 | - id: detect-aws-credentials # Detects *your* aws credentials from the aws cli credentials file 18 | args: 19 | - --allow-missing-credentials 20 | - id: detect-private-key # Detects the presence of private keys 21 | - id: end-of-file-fixer # Ensures that a file is either empty, or ends with one newline. 22 | - id: forbid-new-submodules # Prevent addition of new git submodules 23 | - id: mixed-line-ending # Replaces or checks mixed line ending 24 | # - id: no-commit-to-branch # Don't commit to branch 25 | - id: trailing-whitespace # This hook trims trailing whitespace. 26 | args: ["--markdown-linebreak-ext=md"] 27 | # MyPy 28 | - repo: https://github.com/pre-commit/mirrors-mypy 29 | rev: v0.782 30 | hooks: 31 | - id: mypy 32 | name: mypy (ds1/raw_to_refined) 33 | exclude: (docs/|tests/) 34 | files: ^glue/data_sources/ds1/raw_to_refined/ 35 | - id: mypy 36 | name: mypy (ds1/refined_to_curated) 37 | exclude: (docs/|tests/) 38 | files: ^glue/data_sources/ds1/refined_to_curated/ 39 | - id: mypy 40 | name: mypy (dummy_job) 41 | files: ^glue/data_sources/dummy_job/dummy_transition/ 42 | exclude: (docs/|tests/) 43 | - id: mypy 44 | name: mypy (shared_lib) 45 | files: ^glue/shared/glue_shared_lib 46 | exclude: (docs/|tests/) 47 | # Black 48 | - repo: https://github.com/psf/black # repository moved from ambv/black 49 | rev: 20.8b1 # pin a release; the mutable "stable" tag no longer exists upstream 50 | hooks: 51 | - id: black 52 | args: 53 | - "-l 100" 54 | # Bandit 55 | - repo: https://github.com/PyCQA/bandit 56 | rev: 1.6.2 57 | hooks: 58 | - id: bandit 59 | exclude: (docker/|tests/) 60 | # Flake8 61 | - repo: https://gitlab.com/pycqa/flake8 62 | rev: 3.8.3 63 | hooks: 64 | - id: flake8 65 | exclude: ^(tests/) 66 | args: 67 | - --max-line-length=100 68 | # Terraform 69 | # For these hooks to work you need to have terraform-docs, TFLint and TFSec installed.
70 | - repo: https://github.com/antonbabenko/pre-commit-terraform 71 | rev: v1.43.0 72 | hooks: 73 | # - id: terraform_fmt 74 | - id: terraform_docs 75 | - id: terraform_tflint 76 | args: 77 | - --args=--enable-rule=terraform_deprecated_index 78 | - --args=--enable-rule=terraform_unused_declarations 79 | - --args=--enable-rule=terraform_comment_syntax 80 | - --args=--enable-rule=terraform_documented_outputs 81 | - --args=--enable-rule=terraform_documented_variables 82 | - --args=--enable-rule=terraform_typed_variables 83 | - --args=--enable-rule=terraform_naming_convention 84 | -------------------------------------------------------------------------------- /terraform/solution/glue_jobs_dummy.tf: -------------------------------------------------------------------------------- 1 | ###################### Glue Dummy Jobs for complex workflow demonstration ######################################## 2 | locals { 3 | dummy_job_location = "s3://${var.glue_bucket_name}/code/dummy_job/dummy_transition/dummy_transition.py" 4 | } 5 | module "dummy_job" { 6 | for_each = toset( 7 | [ 8 | "dummy_job_0", 9 | "dummy_job_1", 10 | "dummy_job_2", 11 | "dummy_job_3", 12 | "dummy_job_4", 13 | "dummy_job_5", 14 | ] 15 | ) 16 | source = "../modules/glue-job/python_shell" 17 | name = each.value 18 | script_location = local.dummy_job_location 19 | role_arn = module.glue_role.iam_role_arn 20 | default_arguments = { 21 | "--job-bookmark-option" = "job-bookmark-disable" 22 | "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 23 | "--APP_SETTINGS_ENVIRONMENT" = "dev" 24 | "--LOG_LEVEL" = "DEBUG" 25 | "--S3_BUCKET" = var.glue_bucket_name 26 | } 27 | tags = var.tags 28 | } 29 | # 30 | #module "dummy_job_2" { 31 | # source = "../modules/glue-job/python_shell" 32 | # name = "dummy_job_2" 33 | # script_location = local.dummy_job_location 34 | # role_arn = module.glue_role.iam_role_arn 35 | # default_arguments = { 36 | # "--job-bookmark-option" = "job-bookmark-disable" 37 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 38 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 39 | # "--LOG_LEVEL" = "DEBUG" 40 | # "--S3_BUCKET" = var.glue_bucket_name 41 | # } 42 | # tags = var.tags 43 | #} 44 | # 45 | #module "dummy_job_3" { 46 | # source = "../modules/glue-job/python_shell" 47 | # name = "dummy_job_3" 48 | # script_location = local.dummy_job_location 49 | # role_arn = module.glue_role.iam_role_arn 50 | # default_arguments = { 51 | # "--job-bookmark-option" = "job-bookmark-disable" 52 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 53 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 54 | # "--LOG_LEVEL" = "DEBUG" 55 | # "--S3_BUCKET" = var.glue_bucket_name 56 | # } 57 | # tags = var.tags 58 | #} 59 | # 60 | #module "dummy_job_4" { 61 | # source = "../modules/glue-job/python_shell" 62 | # name = "dummy_job_4" 63 | # script_location = local.dummy_job_location 64 | # role_arn = module.glue_role.iam_role_arn 65 | # default_arguments = { 66 | # "--job-bookmark-option" = "job-bookmark-disable" 67 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 68 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 69 | # "--LOG_LEVEL" = "DEBUG" 70 | # "--S3_BUCKET" = var.glue_bucket_name 71 | # } 72 | # tags = var.tags 73 | #} 74 | # 75 | #module "dummy_job_5" { 76 | # source = "../modules/glue-job/python_shell" 77 | # name = "dummy_job_5" 78 | # script_location = local.dummy_job_location 79 | # role_arn = module.glue_role.iam_role_arn 80 | # default_arguments = { 81 | # "--job-bookmark-option" = "job-bookmark-disable" 82 | # "--TempDir" = 
"s3://${var.glue_bucket_name}/glue-temp" 83 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 84 | # "--LOG_LEVEL" = "DEBUG" 85 | # "--S3_BUCKET" = var.glue_bucket_name 86 | # } 87 | # tags = var.tags 88 | #} 89 | # 90 | #module "dummy_job_6" { 91 | # source = "../modules/glue-job/python_shell" 92 | # name = "dummy_job_6" 93 | # script_location = "s3://${var.glue_bucket_name}/code/${module.dummy_job_1.job_name}/dummy_transition/dummy_transition.py" 94 | # role_arn = module.glue_role.iam_role_arn 95 | # default_arguments = { 96 | # "--job-bookmark-option" = "job-bookmark-disable" 97 | # "--TempDir" = "s3://${var.glue_bucket_name}/glue-temp" 98 | # "--APP_SETTINGS_ENVIRONMENT" = "dev" 99 | # "--LOG_LEVEL" = "DEBUG" 100 | # "--S3_BUCKET" = var.glue_bucket_name 101 | # } 102 | # tags = var.tags 103 | #} 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aws-glue-monorepo-style 2 | 3 | An example of AWS Glue Jobs and workflow deployment with terraform in monorepo style. 4 | 5 | To learn more about decisions behind this structure chek out the supporting articles: 6 | https://dev.to/1oglop1/aws-glue-first-experience-part-1-how-to-run-your-code-3pe3 7 | 8 | ![architecture of this solution](arch_diagram.png) 9 | (for simplicity this solution uses just 1 bucket and does not deploy database) 10 | 11 | ## Deployment: 12 | 13 | Requirements: 14 | 15 | * AWS Account 16 | * S3 bucket to store terraform state. 17 | * Rename `.evn.example` to `.env` and set the values 18 | * export environment variables from `.env` using command: `set -o allexport; source .env; set +o allexport` 19 | * `docker-compose up -d` 20 | * `docker exec -it glue /bin/bash` 21 | 22 | Now we are going to work inside the docker container 23 | 24 | * `make tf-init` 25 | * `make tf-plan` 26 | * `make tf-apply` 27 | * `make jobs-deploy` 28 | 29 | That's it! 30 | If everything went well you can now go to AWS Glue Console and explore jobs and workflows. 31 | 32 | Or start workflow from CLI `aws glue start-workflow-run --name etl-workflow--simple` 33 | 34 | Once you are finished with observations remove everything with `make tf-destroy`. 35 | 36 | ## Development 37 | 38 | With the [release of Glue 2.0 AWS](https://aws.amazon.com/blogs/big-data/developing-aws-glue-etl-jobs-locally-using-a-container/) 39 | released official Glue Docker Image you can use it for local development of glue jobs. 
40 | 41 | Example: 42 | 43 | * `docker exec -it glue /bin/bash` to connect to the container 44 | * `cd /project/glue/data_sources/ds1/raw_to_refined` 45 | * `pip install -r requirements.txt` 46 | * Run the first job: `python raw_to_refined.py --APP_SETTINGS_ENVIRONMENT=dev --LOG_LEVEL=DEBUG --S3_BUCKET=${TF_VAR_glue_bucket_name}` 47 | * `cd /project/glue/data_sources/ds1/refined_to_curated` 48 | * The next step requires results from the previous stage, `raw_to_refined` 49 | * Run the second job: `python refined_to_curated.py --APP_SETTINGS_ENVIRONMENT=dev --LOG_LEVEL=DEBUG --S3_BUCKET=${TF_VAR_glue_bucket_name}` 50 | 51 | If everything went well, you should see output like this: 52 | 53 | ``` 54 | 2020-12-23 14:28:43,278 DEBUG glue_shared.spark_helpers - DF: +--------------------+-----------+-----------+--------+-------+---+------+-----+-----+------+------+--------+-----+------+-----+---------+ 55 | | name| mfr| type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight| cups| rating| 56 | +--------------------+-----------+-----------+--------+-------+---+------+-----+-----+------+------+--------+-----+------+-----+---------+ 57 | | String|Categorical|Categorical| Int| Int|Int| Int|Float|Float| Int| Int| Int| Int| Float|Float| Float| 58 | | 100% Bran| N| C| 70| 4| 1| 130| 10| 5| 6| 280| 25| 3| 1| 0.33|68.402973| 59 | | 100% Natural Bran| Q| C| 120| 3| 5| 15| 2| 8| 8| 135| 0| 3| 1| 1|33.983679| 60 | | All-Bran| K| C| 70| 4| 1| 260| 9| 7| 5| 320| 25| 3| 1| 0.33|59.425505| 61 | |All-Bran with Ext...| K| C| 50| 4| 0| 140| 14| 8| 0| 330| 25| 3| 1| 0.5|93.704912| 62 | | Almond Delight| R| C| 110| 2| 2| 200| 1| 14| 8| -1| 25| 3| 1| 0.75|34.384843| 63 | |Apple Cinnamon Ch...| G| C| 110| 2| 2| 180| 1.5| 10.5| 10| 70| 25| 1| 1| 0.75|29.509541| 64 | | Apple Jacks| K| C| 110| 2| 0| 125| 1| 11| 14| 30| 25| 2| 1| 1|33.174094| 65 | | Basic 4| G| C| 130| 3| 2| 210| 2| 18| 8| 100| 25| 3| 1.33| 0.75|37.038562| 66 | | Bran Chex| R| C| 90| 2| 1| 200| 4| 15| 6| 125| 25| 1| 1| 0.67|49.120253| 67 | +--------------------+-----------+-----------+--------+-------+---+------+-----+-----+------+------+--------+-----+------+-----+---------+ 68 | only showing top 10 rows 69 | ``` 70 | 71 | The commands above start PySpark inside the container and look for files stored in S3 under `/ds1/refined`. 72 | PS: You should avoid running local PySpark on large datasets! 73 | 74 | ## Disclaimer 75 | 76 | Please keep in mind that the IAM roles used in this example are very broad and should not be used as is.
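
## Artifact layout in S3

Each transition's `Makefile` uploads the job script and its packaged dependencies under `code/<job>/<transition>/` in the Glue bucket, and the Terraform job definitions point at those paths. A sketch of the resulting layout after `make jobs-deploy` (bucket and wheel names illustrative):

```
s3://<glue-bucket>/code/ds1/raw_to_refined/raw_to_refined.py
s3://<glue-bucket>/code/ds1/raw_to_refined/dependencies/<wheels>.whl
s3://<glue-bucket>/code/ds1/refined_to_curated/refined_to_curated.py
s3://<glue-bucket>/code/ds1/refined_to_curated/dependencies/refined_to_curated.zip
```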
77 | -------------------------------------------------------------------------------- /dummy_data/cereal.csv: -------------------------------------------------------------------------------- 1 | name;mfr;type;calories;protein;fat;sodium;fiber;carbo;sugars;potass;vitamins;shelf;weight;cups;rating 2 | String;Categorical;Categorical;Int;Int;Int;Int;Float;Float;Int;Int;Int;Int;Float;Float;Float 3 | 100% Bran;N;C;70;4;1;130;10;5;6;280;25;3;1;0.33;68.402973 4 | 100% Natural Bran;Q;C;120;3;5;15;2;8;8;135;0;3;1;1;33.983679 5 | All-Bran;K;C;70;4;1;260;9;7;5;320;25;3;1;0.33;59.425505 6 | All-Bran with Extra Fiber;K;C;50;4;0;140;14;8;0;330;25;3;1;0.5;93.704912 7 | Almond Delight;R;C;110;2;2;200;1;14;8;-1;25;3;1;0.75;34.384843 8 | Apple Cinnamon Cheerios;G;C;110;2;2;180;1.5;10.5;10;70;25;1;1;0.75;29.509541 9 | Apple Jacks;K;C;110;2;0;125;1;11;14;30;25;2;1;1;33.174094 10 | Basic 4;G;C;130;3;2;210;2;18;8;100;25;3;1.33;0.75;37.038562 11 | Bran Chex;R;C;90;2;1;200;4;15;6;125;25;1;1;0.67;49.120253 12 | Bran Flakes;P;C;90;3;0;210;5;13;5;190;25;3;1;0.67;53.313813 13 | Cap'n'Crunch;Q;C;120;1;2;220;0;12;12;35;25;2;1;0.75;18.042851 14 | Cheerios;G;C;110;6;2;290;2;17;1;105;25;1;1;1.25;50.764999 15 | Cinnamon Toast Crunch;G;C;120;1;3;210;0;13;9;45;25;2;1;0.75;19.823573 16 | Clusters;G;C;110;3;2;140;2;13;7;105;25;3;1;0.5;40.400208 17 | Cocoa Puffs;G;C;110;1;1;180;0;12;13;55;25;2;1;1;22.736446 18 | Corn Chex;R;C;110;2;0;280;0;22;3;25;25;1;1;1;41.445019 19 | Corn Flakes;K;C;100;2;0;290;1;21;2;35;25;1;1;1;45.863324 20 | Corn Pops;K;C;110;1;0;90;1;13;12;20;25;2;1;1;35.782791 21 | Count Chocula;G;C;110;1;1;180;0;12;13;65;25;2;1;1;22.396513 22 | Cracklin' Oat Bran;K;C;110;3;3;140;4;10;7;160;25;3;1;0.5;40.448772 23 | Cream of Wheat (Quick);N;H;100;3;0;80;1;21;0;-1;0;2;1;1;64.533816 24 | Crispix;K;C;110;2;0;220;1;21;3;30;25;3;1;1;46.895644 25 | Crispy Wheat & Raisins;G;C;100;2;1;140;2;11;10;120;25;3;1;0.75;36.176196 26 | Double Chex;R;C;100;2;0;190;1;18;5;80;25;3;1;0.75;44.330856 27 | Froot Loops;K;C;110;2;1;125;1;11;13;30;25;2;1;1;32.207582 28 | Frosted Flakes;K;C;110;1;0;200;1;14;11;25;25;1;1;0.75;31.435973 29 | Frosted Mini-Wheats;K;C;100;3;0;0;3;14;7;100;25;2;1;0.8;58.345141 30 | Fruit & Fibre Dates, Walnuts, and Oats;P;C;120;3;2;160;5;12;10;200;25;3;1.25;0.67;40.917047 31 | Fruitful Bran;K;C;120;3;0;240;5;14;12;190;25;3;1.33;0.67;41.015492 32 | Fruity Pebbles;P;C;110;1;1;135;0;13;12;25;25;2;1;0.75;28.025765 33 | Golden Crisp;P;C;100;2;0;45;0;11;15;40;25;1;1;0.88;35.252444 34 | Golden Grahams;G;C;110;1;1;280;0;15;9;45;25;2;1;0.75;23.804043 35 | Grape Nuts Flakes;P;C;100;3;1;140;3;15;5;85;25;3;1;0.88;52.076897 36 | Grape-Nuts;P;C;110;3;0;170;3;17;3;90;25;3;1;0.25;53.371007 37 | Great Grains Pecan;P;C;120;3;3;75;3;13;4;100;25;3;1;0.33;45.811716 38 | Honey Graham Ohs;Q;C;120;1;2;220;1;12;11;45;25;2;1;1;21.871292 39 | Honey Nut Cheerios;G;C;110;3;1;250;1.5;11.5;10;90;25;1;1;0.75;31.072217 40 | Honey-comb;P;C;110;1;0;180;0;14;11;35;25;1;1;1.33;28.742414 41 | Just Right Crunchy Nuggets;K;C;110;2;1;170;1;17;6;60;100;3;1;1;36.523683 42 | Just Right Fruit & Nut;K;C;140;3;1;170;2;20;9;95;100;3;1.3;0.75;36.471512 43 | Kix;G;C;110;2;1;260;0;21;3;40;25;2;1;1.5;39.241114 44 | Life;Q;C;100;4;2;150;2;12;6;95;25;2;1;0.67;45.328074 45 | Lucky Charms;G;C;110;2;1;180;0;12;12;55;25;2;1;1;26.734515 46 | Maypo;A;H;100;4;1;0;0;16;3;95;25;2;1;1;54.850917 47 | Muesli Raisins, Dates, & Almonds;R;C;150;4;3;95;3;16;11;170;25;3;1;1;37.136863 48 | Muesli Raisins, Peaches, & Pecans;R;C;150;4;3;150;3;16;11;170;25;3;1;1;34.139765 49 | Mueslix Crispy 
Blend;K;C;160;3;2;150;3;17;13;160;25;3;1.5;0.67;30.313351 50 | Multi-Grain Cheerios;G;C;100;2;1;220;2;15;6;90;25;1;1;1;40.105965 51 | Nut&Honey Crunch;K;C;120;2;1;190;0;15;9;40;25;2;1;0.67;29.924285 52 | Nutri-Grain Almond-Raisin;K;C;140;3;2;220;3;21;7;130;25;3;1.33;0.67;40.692320 53 | Nutri-grain Wheat;K;C;90;3;0;170;3;18;2;90;25;3;1;1;59.642837 54 | Oatmeal Raisin Crisp;G;C;130;3;2;170;1.5;13.5;10;120;25;3;1.25;0.5;30.450843 55 | Post Nat. Raisin Bran;P;C;120;3;1;200;6;11;14;260;25;3;1.33;0.67;37.840594 56 | Product 19;K;C;100;3;0;320;1;20;3;45;100;3;1;1;41.503540 57 | Puffed Rice;Q;C;50;1;0;0;0;13;0;15;0;3;0.5;1;60.756112 58 | Puffed Wheat;Q;C;50;2;0;0;1;10;0;50;0;3;0.5;1;63.005645 59 | Quaker Oat Squares;Q;C;100;4;1;135;2;14;6;110;25;3;1;0.5;49.511874 60 | Quaker Oatmeal;Q;H;100;5;2;0;2.7;-1;-1;110;0;1;1;0.67;50.828392 61 | Raisin Bran;K;C;120;3;1;210;5;14;12;240;25;2;1.33;0.75;39.259197 62 | Raisin Nut Bran;G;C;100;3;2;140;2.5;10.5;8;140;25;3;1;0.5;39.703400 63 | Raisin Squares;K;C;90;2;0;0;2;15;6;110;25;3;1;0.5;55.333142 64 | Rice Chex;R;C;110;1;0;240;0;23;2;30;25;1;1;1.13;41.998933 65 | Rice Krispies;K;C;110;2;0;290;0;22;3;35;25;1;1;1;40.560159 66 | Shredded Wheat;N;C;80;2;0;0;3;16;0;95;0;1;0.83;1;68.235885 67 | Shredded Wheat 'n'Bran;N;C;90;3;0;0;4;19;0;140;0;1;1;0.67;74.472949 68 | Shredded Wheat spoon size;N;C;90;3;0;0;3;20;0;120;0;1;1;0.67;72.801787 69 | Smacks;K;C;110;2;1;70;1;9;15;40;25;2;1;0.75;31.230054 70 | Special K;K;C;110;6;0;230;1;16;3;55;25;1;1;1;53.131324 71 | Strawberry Fruit Wheats;N;C;90;2;0;15;3;15;5;90;25;2;1;1;59.363993 72 | Total Corn Flakes;G;C;110;2;1;200;0;21;3;35;100;3;1;1;38.839746 73 | Total Raisin Bran;G;C;140;3;1;190;4;15;14;230;100;3;1.5;1;28.592785 74 | Total Whole Grain;G;C;100;3;1;200;3;16;3;110;100;3;1;1;46.658844 75 | Triples;G;C;110;2;1;250;0;21;3;60;25;3;1;0.75;39.106174 76 | Trix;G;C;110;1;1;140;0;13;12;25;25;2;1;1;27.753301 77 | Wheat Chex;R;C;100;3;1;230;3;17;3;115;25;1;1;0.67;49.787445 78 | Wheaties;G;C;100;3;1;200;3;17;3;110;25;1;1;1;51.592193 79 | Wheaties Honey Gold;G;C;110;2;1;200;1;16;8;60;25;1;1;0.75;36.187559 80 | -------------------------------------------------------------------------------- /glue/shared/glue_shared_lib/src/glue_shared/boto3_helpers.py: -------------------------------------------------------------------------------- 1 | """Convenient methods requiring boto3 client.""" 2 | import json 3 | import logging 4 | from typing import Dict 5 | from urllib.parse import urlsplit 6 | 7 | from glue_shared.exceptions import ParametersNotFound, JobFailedError, DataNotAvailable 8 | from glue_shared.helpers import chunked 9 | 10 | LOGGER = logging.getLogger(__name__) 11 | 12 | 13 | def resolve_ssm_parameters(ssm_client, parameters: Dict) -> Dict: 14 | """ 15 | Resolve multiple SSM parameters from a dict. 16 | 17 | Parameters 18 | ---------- 19 | ssm_client 20 | boto3.client('ssm') 21 | parameters 22 | A dictionary of friendly names and parameter names. 23 | 24 | Examples 25 | -------- 26 | >>> import boto3 27 | ... resolve_ssm_parameters(boto3.client('ssm'), {"db_host": "/dev/db/HOST"}) 28 | {'db_host': 'value'} 29 | 30 | Returns 31 | ------- 32 | dict 33 | The original dict with resolved values instead of ssm paths. 
34 | 35 | """ 36 | tmp = {value: key for key, value in parameters.items()} 37 | 38 | valid_parameters = [] 39 | invalid_parameters = [] 40 | 41 | for chunk in chunked(tuple(tmp.keys()), 10): 42 | response = ssm_client.get_parameters(Names=chunk, WithDecryption=True) 43 | valid_parameters.extend(response["Parameters"]) 44 | invalid_parameters.extend(response["InvalidParameters"]) 45 | 46 | if invalid_parameters: 47 | raise ParametersNotFound("Unable to get parameters.", *invalid_parameters) 48 | 49 | tmp.update({param["Name"]: param["Value"] for param in valid_parameters}) 50 | 51 | return {key: tmp[value] for key, value in parameters.items()} 52 | 53 | 54 | def get_connection(glue_client, name: str) -> Dict: 55 | """ 56 | Get connection properties. 57 | 58 | Parameters 59 | ---------- 60 | glue_client 61 | boto3.client('glue') 62 | name 63 | A connection name. 64 | 65 | Examples 66 | -------- 67 | >>> import boto3 68 | ... get_connection(boto3.client('glue'), "connection-name") 69 | { 70 | 'NAME': '', 71 | 'TYPE': '', 72 | 'JDBC_CONNECTION_URL': '', 73 | 'PASSWORD': '', 74 | 'USERNAME': '', 75 | } 76 | 77 | Returns 78 | ------- 79 | A dictionary of connection properties. 80 | The mapping is a simplified version of the 81 | boto3 response: 82 | { 83 | 'NAME': 'Name', 84 | 'TYPE': 'ConnectionType', 85 | ... then follows ConnectionProperties: 86 | 'JDBC_CONNECTION_URL': '', 87 | 'PASSWORD': '', 88 | 'USERNAME': '', 89 | ... and, if JDBC: 90 | 'HOST': '', 91 | 'PORT': '', 92 | 'DATABASE': '', 93 | } 94 | 95 | """ 96 | response = glue_client.get_connection(Name=name) 97 | 98 | ret = { 99 | "NAME": response["Connection"]["Name"], 100 | "TYPE": response["Connection"]["ConnectionType"], 101 | **response["Connection"]["ConnectionProperties"], 102 | } 103 | 104 | if response["Connection"]["ConnectionType"] == "JDBC": 105 | jdbc_url = response["Connection"]["ConnectionProperties"]["JDBC_CONNECTION_URL"] 106 | # Slice the scheme off: str.lstrip("jdbc:") strips *characters*, not the prefix. 107 | surl = urlsplit(jdbc_url[len("jdbc:"):] if jdbc_url.startswith("jdbc:") else jdbc_url) 108 | 109 | ret.update( 110 | { 111 | "HOST": surl.hostname, 112 | "PORT": surl.port, 113 | "DATABASE": surl.path.lstrip("/"), 114 | } 115 | ) 116 | 117 | return ret 118 | 119 | 120 | def gracefully_exit( 121 | sns_client, 122 | sns_topic_arn, 123 | process_results: Dict, 124 | job_result: str = "PASS", 125 | message: str = "Job failed.", 126 | ): 127 | """ 128 | Publish process results to the workflow SNS topic and exit with an error if the job failed. 129 | 130 | Parameters 131 | ---------- 132 | sns_client 133 | boto3.client('sns') 134 | sns_topic_arn 135 | Workflow notification topic ARN. 136 | process_results 137 | A dictionary containing the results of the processing. 138 | This dict must contain a JSON-serialisable object. 139 | job_result 140 | Job result: FAIL or PASS (not validated here). 141 | message 142 | Exit message if the job fails. 143 | 144 | """ 145 | LOGGER.debug("Sending SNS message") 146 | LOGGER.debug("Process results: %s", process_results) 147 | sns_client.publish(TopicArn=sns_topic_arn, Message=json.dumps(process_results)) 148 | if job_result == "FAIL": 149 | LOGGER.debug("Exiting with message. %s", message)
%s", message) 150 | raise JobFailedError(message) 151 | 152 | 153 | def get_s3_keys(client, bucket_name: str, prefix: str): 154 | """ 155 | Get keys from S3 bucket by prefix 156 | Parameters 157 | ---------- 158 | client 159 | boto3.client('s3') 160 | bucket_name 161 | prefix 162 | 163 | Yields 164 | ------- 165 | list_objects_v2 Response 166 | 167 | """ 168 | paginator = client.get_paginator("list_objects_v2") 169 | response_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) 170 | LOGGER.info("Reading S3: s3://%s/%s", bucket_name, prefix) 171 | for idx, page in enumerate(response_iterator): 172 | try: 173 | for response in page["Contents"]: 174 | yield response["Key"] 175 | except KeyError: 176 | raise DataNotAvailable(f"No data available at: s3://{bucket_name}/{prefix}") 177 | --------------------------------------------------------------------------------