├── msk-to-iceberg ├── requirements-dev.txt ├── requirements.txt ├── assets │ ├── iceberg-table.png │ ├── iceberg-data-level-01.png │ ├── iceberg-data-level-02.png │ └── iceberg-data-level-03.png ├── cdk_stacks │ ├── __init__.py │ ├── glue_catalog_database.py │ ├── s3.py │ ├── vpc.py │ ├── lakeformation_permissions.py │ ├── glue_msk_connection.py │ ├── glue_streaming_job.py │ ├── glue_job_role.py │ ├── msk.py │ └── kafka_client_ec2.py ├── source.bat ├── cdk.json ├── src │ ├── utils │ │ └── gen_fake_data.py │ └── main │ │ └── python │ │ ├── spark_dataframe_insert_iceberg_from_kafka.py │ │ ├── spark_sql_insert_overwrite_iceberg_from_kafka.py │ │ └── spark_sql_merge_into_iceberg_from_kafka.py ├── app.py ├── README.md └── glue-streaming-data-from-kafka-to-iceberg-table.svg ├── msk-serverless-to-iceberg ├── requirements-dev.txt ├── requirements.txt ├── assets │ ├── iceberg-table.png │ ├── iceberg-data-level-01.png │ ├── iceberg-data-level-02.png │ └── iceberg-data-level-03.png ├── cdk_stacks │ ├── __init__.py │ ├── glue_catalog_database.py │ ├── s3.py │ ├── vpc.py │ ├── lakeformation_permissions.py │ ├── glue_msk_connection.py │ ├── msk_serverless.py │ ├── glue_streaming_job.py │ ├── glue_job_role.py │ └── kafka_client_ec2.py ├── source.bat ├── cdk.json ├── src │ ├── utils │ │ └── gen_fake_data.py │ └── main │ │ └── python │ │ ├── spark_dataframe_insert_iceberg_from_msk_serverless.py │ │ ├── spark_sql_insert_overwrite_iceberg_from_msk_serverless.py │ │ └── spark_sql_merge_into_iceberg_from_msk_serverless.py ├── app.py ├── README.md └── glue-streaming-data-from-msk-serverless-to-iceberg-table.svg ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md └── CONTRIBUTING.md /msk-to-iceberg/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | mimesis==4.1.3 # The last to support Python 3.6 and 3.7 2 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | mimesis==4.1.3 # The last to support Python 3.6 and 3.7 2 | -------------------------------------------------------------------------------- /msk-to-iceberg/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.61.1 2 | constructs>=10.0.0,<11.0.0 3 | 4 | boto3==1.26.55 -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.61.1 2 | constructs>=10.0.0,<11.0.0 3 | 4 | boto3==1.26.55 -------------------------------------------------------------------------------- /msk-to-iceberg/assets/iceberg-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-to-iceberg/assets/iceberg-table.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- 
/msk-to-iceberg/assets/iceberg-data-level-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-to-iceberg/assets/iceberg-data-level-01.png -------------------------------------------------------------------------------- /msk-to-iceberg/assets/iceberg-data-level-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-to-iceberg/assets/iceberg-data-level-02.png -------------------------------------------------------------------------------- /msk-to-iceberg/assets/iceberg-data-level-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-to-iceberg/assets/iceberg-data-level-03.png -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/assets/iceberg-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-serverless-to-iceberg/assets/iceberg-table.png -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/assets/iceberg-data-level-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-serverless-to-iceberg/assets/iceberg-data-level-01.png -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/assets/iceberg-data-level-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-serverless-to-iceberg/assets/iceberg-data-level-02.png -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/assets/iceberg-data-level-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-ingestion-from-kafka-to-apache-iceberg/HEAD/msk-serverless-to-iceberg/assets/iceberg-data-level-03.png -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .vpc import VpcStack 2 | from .kafka_client_ec2 import KafkaClientEC2InstanceStack 3 | from .msk import MskStack 4 | from .glue_job_role import GlueJobRoleStack 5 | from .glue_msk_connection import GlueMSKConnectionStack 6 | from .glue_catalog_database import GlueCatalogDatabaseStack 7 | from .glue_streaming_job import GlueStreamingJobStack 8 | from .lakeformation_permissions import DataLakePermissionsStack 9 | from .s3 import S3BucketStack 10 | -------------------------------------------------------------------------------- /msk-to-iceberg/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .vpc import VpcStack 2 | from .kafka_client_ec2 import KafkaClientEC2InstanceStack 3 | from .msk_serverless import MskServerlessStack 4 | from .glue_job_role import GlueJobRoleStack 5 | from .glue_msk_connection import GlueMSKConnectionStack 6 | from .glue_catalog_database import GlueCatalogDatabaseStack 7 | from .glue_streaming_job import GlueStreamingJobStack 8 | from .lakeformation_permissions import DataLakePermissionsStack 9 | from .s3 import S3BucketStack 10 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 
8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/glue_catalog_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueCatalogDatabaseStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_kinesis_table = self.node.try_get_context('glue_job_input_arguments') 20 | database_name = glue_kinesis_table['--database_name'] 21 | 22 | cfn_database = aws_glue.CfnDatabase(self, "GlueCfnDatabase", 23 | catalog_id=cdk.Aws.ACCOUNT_ID, 24 | database_input=aws_glue.CfnDatabase.DatabaseInputProperty( 25 | name=database_name 26 | ) 27 | ) 28 | cfn_database.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 29 | 30 | cdk.CfnOutput(self, f'{self.stack_name}_GlueDatabaseName', 31 | value=cfn_database.database_input.name) 32 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/glue_catalog_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueCatalogDatabaseStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_kinesis_table = self.node.try_get_context('glue_job_input_arguments') 20 | database_name = glue_kinesis_table['--database_name'] 21 | 22 | cfn_database = aws_glue.CfnDatabase(self, "GlueCfnDatabase", 23 | catalog_id=cdk.Aws.ACCOUNT_ID, 24 | database_input=aws_glue.CfnDatabase.DatabaseInputProperty( 25 | name=database_name 26 | ) 27 | ) 28 | cfn_database.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 29 | 30 | cdk.CfnOutput(self, f'{self.stack_name}_GlueDatabaseName', 31 | value=cfn_database.database_input.name) 32 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | from urllib.parse import urlparse 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_s3 as s3 12 | ) 13 | 14 | from constructs import Construct 15 | 16 | 17 | class S3BucketStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, 
construct_id, **kwargs) 21 | 22 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 23 | s3_path = glue_job_input_arguments["--iceberg_s3_path"] 24 | s3_bucket_name = urlparse(s3_path).netloc 25 | 26 | s3_bucket = s3.Bucket(self, "s3bucket", 27 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 28 | bucket_name=s3_bucket_name) 29 | 30 | self.s3_bucket_name = s3_bucket.bucket_name 31 | 32 | cdk.CfnOutput(self, f'{self.stack_name}_S3Bucket', value=self.s3_bucket_name) 33 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | from urllib.parse import urlparse 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_s3 as s3 12 | ) 13 | 14 | from constructs import Construct 15 | 16 | 17 | class S3BucketStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 23 | s3_path = glue_job_input_arguments["--iceberg_s3_path"] 24 | s3_bucket_name = urlparse(s3_path).netloc 25 | 26 | s3_bucket = s3.Bucket(self, "s3bucket", 27 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 28 | bucket_name=s3_bucket_name) 29 | 30 | self.s3_bucket_name = s3_bucket.bucket_name 31 | 32 | cdk.CfnOutput(self, f'{self.stack_name}_S3Bucket', value=self.s3_bucket_name) 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Glue Streaming Ingestion from Kafka to Apache Iceberg 2 | 3 | This is a collection of AWS CDK projects that show how to directly ingest streaming data from Amazon Managed Streaming for Apache Kafka (Amazon MSK) and MSK Serverless into an Apache Iceberg table in Amazon S3 with AWS Glue Streaming. 4 | 5 | | Project | Description | Tags | 6 | |---------|-------------|------| 7 | | [msk-to-iceberg](./msk-to-iceberg/) | ![glue-streaming-ingestion-from-msk-to-iceberg-arch](./msk-to-iceberg/glue-streaming-data-from-kafka-to-iceberg-table.svg) | AWS Glue Streaming, Amazon Managed Streaming for Apache Kafka (MSK), S3, Apache Iceberg | 8 | | [msk-serverless-to-iceberg](./msk-serverless-to-iceberg/) | ![glue-streaming-ingestion-from-msk-serverless-to-iceberg-arch](./msk-serverless-to-iceberg/glue-streaming-data-from-msk-serverless-to-iceberg-table.svg) | AWS Glue Streaming, MSK Serverless, S3, Apache Iceberg | 9 | 10 | Enjoy! 11 | 12 | ## Security 13 | 14 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 15 | 16 | ## License 17 | 18 | This library is licensed under the MIT-0 License. See the LICENSE file. 
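Both projects provision a Kafka client EC2 instance and ship `src/utils/gen_fake_data.py`, which prints tab-separated `key\tjson` records with `name`, `age`, and `m_time` fields for the source topic. As a rough illustration of what lands on that topic, here is a minimal producer sketch using the `kafka-python` package; the package, the topic name, and the plaintext bootstrap endpoint are assumptions for illustration only (a real MSK or MSK Serverless cluster typically requires TLS and/or IAM authentication), so follow each project's README for the actual ingestion steps.

```python
# Hedged sketch only: kafka-python is NOT a dependency of these projects,
# and the topic name and bootstrap endpoint are made-up placeholders.
import json
from datetime import datetime

from kafka import KafkaProducer  # pip install kafka-python (assumption)

producer = KafkaProducer(
    bootstrap_servers=["localhost:9092"],  # placeholder endpoint
    key_serializer=str.encode,             # keys are plain strings
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)

# Same record shape that gen_fake_data.py prints as "<name>\t<json>".
record = {
    "name": "Arica",
    "age": 33,
    "m_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}

# The record key mirrors the partition key printed by gen_fake_data.py.
producer.send("example-topic", key=record["name"], value=record)
producer.flush()
```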
19 | 20 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 19 | "@aws-cdk/core:checkSecretUsage": true, 20 | "@aws-cdk/core:target-partitions": [ 21 | "aws", 22 | "aws-cn" 23 | ], 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 29 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 30 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 31 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 32 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 33 | "@aws-cdk/core:enablePartitionLiterals": true, 34 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 35 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 36 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 37 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 38 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 39 | "@aws-cdk/aws-route53-patters:useCertificate": true, 40 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 19 | "@aws-cdk/core:checkSecretUsage": true, 20 | "@aws-cdk/core:target-partitions": [ 21 | "aws", 22 | "aws-cn" 23 | ], 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 29 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 30 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 31 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 32 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 33 | "@aws-cdk/core:enablePartitionLiterals": true, 34 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 35 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 36 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 37 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 38 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 39 | "@aws-cdk/aws-route53-patters:useCertificate": true, 40 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false 41 | } 42 | } 43 | 
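Note that the two `cdk.json` files above only carry CDK feature flags, while the stacks resolve their parameters at synth time with calls such as `self.node.try_get_context('glue_job_input_arguments')`, `try_get_context('glue_job_name')`, and `try_get_context('msk_cluster_name')`. Those values are expected to be supplied as additional `context` entries or with `cdk deploy -c key=value`. The sketch below only illustrates how such a lookup resolves; its keys and values are purely hypothetical placeholders (the real ones are documented in each project's README).

```python
# Hedged sketch: illustrates the context lookup pattern the stacks rely on.
# The keys/values below are placeholders, not this repository's actual config.
import aws_cdk as cdk

app = cdk.App(context={
    "glue_job_input_arguments": {
        "--database_name": "iceberg_demo_db",                # placeholder
        "--iceberg_s3_path": "s3://example-bucket/iceberg",  # placeholder
    }
})

# Same lookup pattern used inside the stacks via self.node.try_get_context(...)
job_args = app.node.try_get_context("glue_job_input_arguments") or {}
print(job_args["--database_name"])    # -> iceberg_demo_db
print(job_args["--iceberg_s3_path"])  # -> s3://example-bucket/iceberg
```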
-------------------------------------------------------------------------------- /msk-to-iceberg/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import datetime 8 | import json 9 | import random 10 | import time 11 | 12 | import mimesis 13 | 14 | # Mimesis 5.0 supports Python 3.8, 3.9, and 3.10. 15 | # The Mimesis 4.1.3 is the last to support Python 3.6 and 3.7 16 | # For more information, see https://mimesis.name/en/latest/changelog.html#version-5-0-0 17 | assert mimesis.__version__ == '4.1.3' 18 | 19 | from mimesis import locales 20 | from mimesis.schema import Field, Schema 21 | 22 | random.seed(47) 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | 28 | parser.add_argument('--max-count', default=10, type=int, help='The max number of records to put.(default: 10)') 29 | 30 | options = parser.parse_args() 31 | 32 | _CURRENT_YEAR = datetime.now().year 33 | _NAMES = 'Arica,Burton,Cory,Fernando,Gonzalo,Kenton,Linsey,Micheal,Ricky,Takisha'.split(',') 34 | 35 | #XXX: For more information about synthetic data schema, see 36 | # https://github.com/aws-samples/aws-glue-streaming-etl-blog/blob/master/config/generate_data.py 37 | _ = Field(locale=locales.EN) 38 | 39 | _schema = Schema(schema=lambda: { 40 | # "name": _("first_name"), 41 | "name": _("choice", items=_NAMES), 42 | "age": _("age"), 43 | "m_time": _("formatted_datetime", fmt="%Y-%m-%d %H:%M:%S", start=_CURRENT_YEAR, end=_CURRENT_YEAR) 44 | }) 45 | 46 | cnt = 0 47 | for record in _schema.create(options.max_count): 48 | cnt += 1 49 | partition_key = record['name'] 50 | print(f"{partition_key}\t{json.dumps(record)}") 51 | time.sleep(random.choices([0.01, 0.03, 0.05, 0.07, 0.1])[-1]) 52 | 53 | print(f'[INFO] {cnt} records are processed', file=sys.stderr) 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | 59 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import datetime 8 | import json 9 | import random 10 | import time 11 | 12 | import mimesis 13 | 14 | # Mimesis 5.0 supports Python 3.8, 3.9, and 3.10. 
15 | # The Mimesis 4.1.3 is the last to support Python 3.6 and 3.7 16 | # For more information, see https://mimesis.name/en/latest/changelog.html#version-5-0-0 17 | assert mimesis.__version__ == '4.1.3' 18 | 19 | from mimesis import locales 20 | from mimesis.schema import Field, Schema 21 | 22 | random.seed(47) 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | 28 | parser.add_argument('--max-count', default=10, type=int, help='The max number of records to put.(default: 10)') 29 | 30 | options = parser.parse_args() 31 | 32 | _CURRENT_YEAR = datetime.now().year 33 | _NAMES = 'Arica,Burton,Cory,Fernando,Gonzalo,Kenton,Linsey,Micheal,Ricky,Takisha'.split(',') 34 | 35 | #XXX: For more information about synthetic data schema, see 36 | # https://github.com/aws-samples/aws-glue-streaming-etl-blog/blob/master/config/generate_data.py 37 | _ = Field(locale=locales.EN) 38 | 39 | _schema = Schema(schema=lambda: { 40 | # "name": _("first_name"), 41 | "name": _("choice", items=_NAMES), 42 | "age": _("age"), 43 | "m_time": _("formatted_datetime", fmt="%Y-%m-%d %H:%M:%S", start=_CURRENT_YEAR, end=_CURRENT_YEAR) 44 | }) 45 | 46 | cnt = 0 47 | for record in _schema.create(options.max_count): 48 | cnt += 1 49 | partition_key = record['name'] 50 | print(f"{partition_key}\t{json.dumps(record)}") 51 | time.sleep(random.choices([0.01, 0.03, 0.05, 0.07, 0.1])[-1]) 52 | 53 | print(f'[INFO] {cnt} records are processed', file=sys.stderr) 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | 59 | -------------------------------------------------------------------------------- /msk-to-iceberg/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from cdk_stacks import ( 7 | VpcStack, 8 | MskStack, 9 | KafkaClientEC2InstanceStack, 10 | GlueJobRoleStack, 11 | GlueMSKConnectionStack, 12 | GlueCatalogDatabaseStack, 13 | GlueStreamingJobStack, 14 | DataLakePermissionsStack, 15 | S3BucketStack 16 | ) 17 | 18 | 19 | APP_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 20 | region=os.getenv('CDK_DEFAULT_REGION')) 21 | 22 | app = cdk.App() 23 | 24 | s3_bucket = S3BucketStack(app, 'KafkaToIcebergS3Path') 25 | 26 | vpc_stack = VpcStack(app, 'KafkaToIcebergStackVpc', 27 | env=APP_ENV 28 | ) 29 | vpc_stack.add_dependency(s3_bucket) 30 | 31 | msk_stack = MskStack(app, 'KafkaAsGlueStreamingJobDataSource', 32 | vpc_stack.vpc, 33 | env=APP_ENV 34 | ) 35 | msk_stack.add_dependency(vpc_stack) 36 | 37 | kafka_client_ec2_stack = KafkaClientEC2InstanceStack(app, 'KafkaClientEC2Instance', 38 | vpc_stack.vpc, 39 | msk_stack.sg_msk_client, 40 | msk_stack.msk_cluster_name, 41 | env=APP_ENV 42 | ) 43 | kafka_client_ec2_stack.add_dependency(msk_stack) 44 | 45 | glue_msk_connection = GlueMSKConnectionStack(app, 'GlueMSKConnection', 46 | vpc_stack.vpc, 47 | msk_stack.sg_msk_client, 48 | env=APP_ENV 49 | ) 50 | glue_msk_connection.add_dependency(msk_stack) 51 | 52 | glue_job_role = GlueJobRoleStack(app, 'GlueStreamingMSKtoIcebergJobRole') 53 | glue_job_role.add_dependency(msk_stack) 54 | 55 | glue_database = GlueCatalogDatabaseStack(app, 'GlueIcebergDatabase') 56 | 57 | grant_lake_formation_permissions = DataLakePermissionsStack(app, 'GrantLFPermissionsOnGlueJobRole', 58 | glue_job_role.glue_job_role 59 | ) 60 | grant_lake_formation_permissions.add_dependency(glue_database) 61 | grant_lake_formation_permissions.add_dependency(glue_job_role) 62 | 63 | glue_streaming_job = 
GlueStreamingJobStack(app, 'GlueStreamingJobMSKtoIceberg', 64 | glue_job_role.glue_job_role, 65 | glue_msk_connection.msk_connection_info 66 | ) 67 | glue_streaming_job.add_dependency(grant_lake_formation_permissions) 68 | 69 | app.synth() 70 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from cdk_stacks import ( 7 | VpcStack, 8 | MskServerlessStack, 9 | KafkaClientEC2InstanceStack, 10 | GlueJobRoleStack, 11 | GlueMSKConnectionStack, 12 | GlueCatalogDatabaseStack, 13 | GlueStreamingJobStack, 14 | DataLakePermissionsStack, 15 | S3BucketStack 16 | ) 17 | 18 | 19 | APP_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 20 | region=os.getenv('CDK_DEFAULT_REGION')) 21 | 22 | app = cdk.App() 23 | 24 | s3_bucket = S3BucketStack(app, 'MSKServerlessToIcebergS3Path') 25 | 26 | vpc_stack = VpcStack(app, 'MSKServerlessToIcebergStackVpc', 27 | env=APP_ENV 28 | ) 29 | vpc_stack.add_dependency(s3_bucket) 30 | 31 | msk_stack = MskServerlessStack(app, 'MSKServerlessAsGlueStreamingJobDataSource', 32 | vpc_stack.vpc, 33 | env=APP_ENV 34 | ) 35 | msk_stack.add_dependency(s3_bucket) 36 | 37 | kafka_client_ec2_stack = KafkaClientEC2InstanceStack(app, 'MSKServerlessClientEC2Instance', 38 | vpc_stack.vpc, 39 | msk_stack.sg_msk_client, 40 | msk_stack.msk_cluster_name, 41 | env=APP_ENV 42 | ) 43 | kafka_client_ec2_stack.add_dependency(msk_stack) 44 | 45 | glue_msk_connection = GlueMSKConnectionStack(app, 'GlueMSKServerlessConnection', 46 | vpc_stack.vpc, 47 | msk_stack.sg_msk_client, 48 | env=APP_ENV 49 | ) 50 | glue_msk_connection.add_dependency(msk_stack) 51 | 52 | glue_job_role = GlueJobRoleStack(app, 'GlueStreamingMSKServerlessToIcebergJobRole', 53 | msk_stack.msk_cluster_name, 54 | ) 55 | glue_job_role.add_dependency(msk_stack) 56 | 57 | glue_database = GlueCatalogDatabaseStack(app, 'GlueIcebergDatabase') 58 | 59 | grant_lake_formation_permissions = DataLakePermissionsStack(app, 'GrantLFPermissionsOnGlueJobRole', 60 | glue_job_role.glue_job_role 61 | ) 62 | grant_lake_formation_permissions.add_dependency(glue_database) 63 | grant_lake_formation_permissions.add_dependency(glue_job_role) 64 | 65 | glue_streaming_job = GlueStreamingJobStack(app, 'GlueStreamingJobMSKServerlessToIceberg', 66 | glue_job_role.glue_job_role, 67 | glue_msk_connection.msk_connection_info 68 | ) 69 | glue_streaming_job.add_dependency(grant_lake_formation_permissions) 70 | 71 | app.synth() 72 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/vpc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import aws_cdk as cdk 7 | 8 | from aws_cdk import ( 9 | Stack, 10 | aws_ec2, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class VpcStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | #XXX: For creating this CDK Stack in the existing VPC, 21 | # remove comments from the below codes and 22 | # comments out vpc = aws_ec2.Vpc(..) 
codes, 23 | # then pass -c vpc_name=your-existing-vpc to cdk command 24 | # for example, 25 | # cdk -c vpc_name=your-existing-vpc syth 26 | # 27 | if str(os.environ.get('USE_DEFAULT_VPC', 'false')).lower() == 'true': 28 | vpc_name = self.node.try_get_context("vpc_name") or "default" 29 | self.vpc = aws_ec2.Vpc.from_lookup(self, "MSKVpc", 30 | is_default=True, 31 | vpc_name=vpc_name) 32 | else: 33 | #XXX: To use more than 2 AZs, be sure to specify the account and region on your stack. 34 | #XXX: https://docs.aws.amazon.com/cdk/api/latest/python/aws_cdk.aws_ec2/Vpc.html 35 | self.vpc = aws_ec2.Vpc(self, "MSKServerlessVpc", 36 | ip_addresses=aws_ec2.IpAddresses.cidr("10.0.0.0/21"), 37 | max_azs=3, 38 | 39 | # 'subnetConfiguration' specifies the "subnet groups" to create. 40 | # Every subnet group will have a subnet for each AZ, so this 41 | # configuration will create `2 groups × 3 AZs = 6` subnets. 42 | subnet_configuration=[ 43 | { 44 | "cidrMask": 24, 45 | "name": "Public", 46 | "subnetType": aws_ec2.SubnetType.PUBLIC, 47 | }, 48 | { 49 | "cidrMask": 24, 50 | "name": "Private", 51 | "subnetType": aws_ec2.SubnetType.PRIVATE_WITH_EGRESS 52 | } 53 | ], 54 | gateway_endpoints={ 55 | "S3": aws_ec2.GatewayVpcEndpointOptions( 56 | service=aws_ec2.GatewayVpcEndpointAwsService.S3 57 | ) 58 | } 59 | ) 60 | 61 | 62 | #XXX: The Name field of every Export member must be specified and 63 | # consist only of alphanumeric characters,colons, or hyphens. 64 | cdk.CfnOutput(self, 'VPCID', value=self.vpc.vpc_id, 65 | export_name=f'{self.stack_name}-VPCID') 66 | 67 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/vpc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import aws_cdk as cdk 7 | 8 | from aws_cdk import ( 9 | Stack, 10 | aws_ec2, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class VpcStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | #XXX: For creating this CDK Stack in the existing VPC, 21 | # remove comments from the below codes and 22 | # comments out vpc = aws_ec2.Vpc(..) codes, 23 | # then pass -c vpc_name=your-existing-vpc to cdk command 24 | # for example, 25 | # cdk -c vpc_name=your-existing-vpc syth 26 | # 27 | if str(os.environ.get('USE_DEFAULT_VPC', 'false')).lower() == 'true': 28 | vpc_name = self.node.try_get_context("vpc_name") or "default" 29 | self.vpc = aws_ec2.Vpc.from_lookup(self, "MSKServerlessVpc", 30 | is_default=True, 31 | vpc_name=vpc_name) 32 | else: 33 | #XXX: To use more than 2 AZs, be sure to specify the account and region on your stack. 34 | #XXX: https://docs.aws.amazon.com/cdk/api/latest/python/aws_cdk.aws_ec2/Vpc.html 35 | self.vpc = aws_ec2.Vpc(self, "MSKServerlessVpc", 36 | ip_addresses=aws_ec2.IpAddresses.cidr("10.0.0.0/21"), 37 | max_azs=3, 38 | 39 | # 'subnetConfiguration' specifies the "subnet groups" to create. 40 | # Every subnet group will have a subnet for each AZ, so this 41 | # configuration will create `2 groups × 3 AZs = 6` subnets. 
42 | subnet_configuration=[ 43 | { 44 | "cidrMask": 24, 45 | "name": "Public", 46 | "subnetType": aws_ec2.SubnetType.PUBLIC, 47 | }, 48 | { 49 | "cidrMask": 24, 50 | "name": "Private", 51 | "subnetType": aws_ec2.SubnetType.PRIVATE_WITH_EGRESS 52 | } 53 | ], 54 | gateway_endpoints={ 55 | "S3": aws_ec2.GatewayVpcEndpointOptions( 56 | service=aws_ec2.GatewayVpcEndpointAwsService.S3 57 | ) 58 | } 59 | ) 60 | 61 | 62 | #XXX: The Name field of every Export member must be specified and 63 | # consist only of alphanumeric characters,colons, or hyphens. 64 | cdk.CfnOutput(self, 'VPCID', value=self.vpc.vpc_id, 65 | export_name=f'{self.stack_name}-VPCID') 66 | 67 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/lakeformation_permissions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, glue_job_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 20 | database_name = glue_job_input_arguments["--database_name"] 21 | 22 | #XXXX: The role assumed by cdk is not a data lake administrator. 23 | # So, deploying PrincipalPermissions meets the error such as: 24 | # "Resource does not exist or requester is not authorized to access requested permissions." 25 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
26 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 27 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 28 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 29 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 30 | )] 31 | ) 32 | 33 | cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions", 34 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 35 | permissions_with_grant_option=[], 36 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 37 | data_lake_principal_identifier=glue_job_role.role_arn 38 | ), 39 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 40 | #XXX: Can't specify a TableWithColumns resource and a Table resource 41 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 42 | catalog_id=cdk.Aws.ACCOUNT_ID, 43 | database_name=database_name, 44 | # name="ALL_TABLES", 45 | table_wildcard={} 46 | ) 47 | ) 48 | ) 49 | cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 50 | 51 | #XXX: In order to keep resource destruction order, 52 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 53 | cfn_principal_permissions.add_dependency(cfn_data_lake_settings) 54 | 55 | cdk.CfnOutput(self, f'{self.stack_name}_Principal', 56 | value=cfn_principal_permissions.attr_principal_identifier) 57 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/lakeformation_permissions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, glue_job_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 20 | database_name = glue_job_input_arguments["--database_name"] 21 | 22 | #XXXX: The role assumed by cdk is not a data lake administrator. 23 | # So, deploying PrincipalPermissions meets the error such as: 24 | # "Resource does not exist or requester is not authorized to access requested permissions." 25 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
26 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 27 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 28 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 29 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 30 | )] 31 | ) 32 | 33 | cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions", 34 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 35 | permissions_with_grant_option=[], 36 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 37 | data_lake_principal_identifier=glue_job_role.role_arn 38 | ), 39 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 40 | #XXX: Can't specify a TableWithColumns resource and a Table resource 41 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 42 | catalog_id=cdk.Aws.ACCOUNT_ID, 43 | database_name=database_name, 44 | # name="ALL_TABLES", 45 | table_wildcard={} 46 | ) 47 | ) 48 | ) 49 | cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 50 | 51 | #XXX: In order to keep resource destruction order, 52 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 53 | cfn_principal_permissions.add_dependency(cfn_data_lake_settings) 54 | 55 | cdk.CfnOutput(self, f'{self.stack_name}_Principal', 56 | value=cfn_principal_permissions.attr_principal_identifier) 57 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/glue_msk_connection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import boto3 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_ec2, 12 | aws_glue, 13 | aws_msk 14 | ) 15 | from constructs import Construct 16 | 17 | 18 | class GlueMSKConnectionStack(Stack): 19 | 20 | def __init__(self, scope: Construct, construct_id: str, vpc, sg_msk_client, **kwargs) -> None: 21 | super().__init__(scope, construct_id, **kwargs) 22 | 23 | sg_glue_cluster = aws_ec2.SecurityGroup(self, 'GlueClusterSecurityGroup', 24 | vpc=vpc, 25 | allow_all_outbound=True, 26 | description='security group for Amazon Glue Cluster', 27 | security_group_name='glue-cluster-sg' 28 | ) 29 | sg_glue_cluster.add_ingress_rule(peer=sg_glue_cluster, connection=aws_ec2.Port.all_tcp(), 30 | description='inter-communication between glue cluster nodes') 31 | cdk.Tags.of(sg_glue_cluster).add('Name', 'glue-cluster-sg') 32 | 33 | msk_cluster_name = self.node.try_get_context('msk').get('cluster_name') 34 | msk_client = boto3.client('kafka', region_name=vpc.env.region) 35 | response = msk_client.list_clusters(ClusterNameFilter=msk_cluster_name) 36 | msk_cluster_info_list = response['ClusterInfoList'] 37 | if not msk_cluster_info_list: 38 | kafka_bootstrap_servers = "localhost:9094" 39 | else: 40 | msk_cluster_arn = msk_cluster_info_list[0]['ClusterArn'] 41 | msk_brokers = msk_client.get_bootstrap_brokers(ClusterArn=msk_cluster_arn) 42 | kafka_bootstrap_servers = msk_brokers['BootstrapBrokerString'] 43 | assert kafka_bootstrap_servers 44 | 45 | connection_properties = { 46 | "KAFKA_BOOTSTRAP_SERVERS": kafka_bootstrap_servers, 47 | "KAFKA_SSL_ENABLED": "false" 48 | } 49 | 50 | subnet = 
vpc.select_subnets(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS).subnets[0] 51 | 52 | connection_input_property = aws_glue.CfnConnection.ConnectionInputProperty( 53 | connection_type="KAFKA", 54 | connection_properties=connection_properties, 55 | name="msk-connector", 56 | physical_connection_requirements=aws_glue.CfnConnection.PhysicalConnectionRequirementsProperty( 57 | security_group_id_list=[sg_msk_client.security_group_id, sg_glue_cluster.security_group_id], 58 | subnet_id=subnet.subnet_id, 59 | availability_zone=subnet.availability_zone 60 | ) 61 | ) 62 | 63 | msk_connection = aws_glue.CfnConnection(self, 'GlueMSKConnector', 64 | catalog_id=cdk.Aws.ACCOUNT_ID, 65 | connection_input=connection_input_property 66 | ) 67 | 68 | self.msk_connection_info = msk_connection.connection_input 69 | 70 | cdk.CfnOutput(self, f'{self.stack_name}-MSKConnectorName', value=self.msk_connection_info.name) 71 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/glue_msk_connection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import boto3 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_ec2, 12 | aws_glue, 13 | aws_msk 14 | ) 15 | from constructs import Construct 16 | 17 | 18 | class GlueMSKConnectionStack(Stack): 19 | 20 | def __init__(self, scope: Construct, construct_id: str, vpc, sg_msk_client, **kwargs) -> None: 21 | super().__init__(scope, construct_id, **kwargs) 22 | 23 | sg_glue_cluster = aws_ec2.SecurityGroup(self, 'GlueClusterSecurityGroup', 24 | vpc=vpc, 25 | allow_all_outbound=True, 26 | description='security group for Amazon Glue Cluster', 27 | security_group_name='glue-cluster-sg' 28 | ) 29 | sg_glue_cluster.add_ingress_rule(peer=sg_glue_cluster, connection=aws_ec2.Port.all_tcp(), 30 | description='inter-communication between glue cluster nodes') 31 | cdk.Tags.of(sg_glue_cluster).add('Name', 'glue-cluster-sg') 32 | 33 | msk_cluster_name = self.node.try_get_context('msk_cluster_name') 34 | msk_client = boto3.client('kafka', region_name=vpc.env.region) 35 | response = msk_client.list_clusters_v2(ClusterNameFilter=msk_cluster_name) 36 | msk_cluster_info_list = response['ClusterInfoList'] 37 | if not msk_cluster_info_list: 38 | kafka_bootstrap_servers = "localhost:9094" 39 | else: 40 | msk_cluster_arn = msk_cluster_info_list[0]['ClusterArn'] 41 | msk_brokers = msk_client.get_bootstrap_brokers(ClusterArn=msk_cluster_arn) 42 | kafka_bootstrap_servers = msk_brokers['BootstrapBrokerStringSaslIam'] 43 | assert kafka_bootstrap_servers 44 | 45 | connection_properties = { 46 | "KAFKA_BOOTSTRAP_SERVERS": kafka_bootstrap_servers, 47 | "KAFKA_SSL_ENABLED": "false" 48 | } 49 | 50 | subnet = vpc.select_subnets(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS).subnets[0] 51 | 52 | connection_input_property = aws_glue.CfnConnection.ConnectionInputProperty( 53 | connection_type="KAFKA", 54 | connection_properties=connection_properties, 55 | name="msk-serverless-connector", 56 | physical_connection_requirements=aws_glue.CfnConnection.PhysicalConnectionRequirementsProperty( 57 | security_group_id_list=[sg_msk_client.security_group_id, sg_glue_cluster.security_group_id], 58 | subnet_id=subnet.subnet_id, 59 | availability_zone=subnet.availability_zone 60 | ) 61 | ) 62 | 63 | msk_connection = aws_glue.CfnConnection(self, 'GlueMSKConnector', 64 | 
catalog_id=cdk.Aws.ACCOUNT_ID, 65 | connection_input=connection_input_property 66 | ) 67 | 68 | self.msk_connection_info = msk_connection.connection_input 69 | 70 | cdk.CfnOutput(self, f'{self.stack_name}-MSKConnectorName', value=self.msk_connection_info.name) 71 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/msk_serverless.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Stack, 12 | aws_ec2, 13 | aws_msk 14 | ) 15 | from constructs import Construct 16 | 17 | random.seed(47) 18 | 19 | 20 | class MskServerlessStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, vpc, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | msk_cluster_name = self.node.try_get_context("msk_cluster_name") 26 | 27 | MSK_CLIENT_SG_NAME = 'msk-client-sg-{}'.format(''.join(random.sample((string.ascii_lowercase), k=5))) 28 | sg_msk_client = aws_ec2.SecurityGroup(self, 'KafkaClientSecurityGroup', 29 | vpc=vpc, 30 | allow_all_outbound=True, 31 | description='security group for Amazon MSK client', 32 | security_group_name=MSK_CLIENT_SG_NAME 33 | ) 34 | cdk.Tags.of(sg_msk_client).add('Name', MSK_CLIENT_SG_NAME) 35 | 36 | MSK_CLUSTER_SG_NAME = 'msk-cluster-sg-{}'.format(''.join(random.sample((string.ascii_lowercase), k=5))) 37 | sg_msk_cluster = aws_ec2.SecurityGroup(self, 'MSKSecurityGroup', 38 | vpc=vpc, 39 | allow_all_outbound=True, 40 | description='security group for Amazon MSK Cluster', 41 | security_group_name=MSK_CLUSTER_SG_NAME 42 | ) 43 | sg_msk_cluster.add_ingress_rule(peer=sg_msk_client, connection=aws_ec2.Port.tcp(9098), 44 | description='msk client security group') 45 | cdk.Tags.of(sg_msk_cluster).add('Name', MSK_CLUSTER_SG_NAME) 46 | 47 | msk_serverless_cluster = aws_msk.CfnServerlessCluster(self, "MSKServerlessCfnCluster", 48 | #XXX: A serverless cluster must use SASL/IAM authentication 49 | client_authentication=aws_msk.CfnServerlessCluster.ClientAuthenticationProperty( 50 | sasl=aws_msk.CfnServerlessCluster.SaslProperty( 51 | iam=aws_msk.CfnServerlessCluster.IamProperty( 52 | enabled=True 53 | ) 54 | ) 55 | ), 56 | cluster_name=msk_cluster_name, 57 | vpc_configs=[aws_msk.CfnServerlessCluster.VpcConfigProperty( 58 | subnet_ids=vpc.select_subnets(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS).subnet_ids, 59 | security_groups=[sg_msk_client.security_group_id, sg_msk_cluster.security_group_id] 60 | )] 61 | ) 62 | 63 | self.sg_msk_client = sg_msk_client 64 | self.msk_cluster_name = msk_serverless_cluster.cluster_name 65 | self.msk_cluster_arn = msk_serverless_cluster.attr_arn 66 | 67 | cdk.CfnOutput(self, f'{self.stack_name}-SecurityGroupId', 68 | value=self.sg_msk_client.security_group_id) 69 | cdk.CfnOutput(self, f'{self.stack_name}-ClusterName', 70 | value=self.msk_cluster_name) 71 | cdk.CfnOutput(self, f'{self.stack_name}-ClusterArn', 72 | value=self.msk_cluster_arn) 73 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/glue_streaming_job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue, 10 | aws_s3 as s3, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class GlueStreamingJobStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, glue_job_role, msk_connection_info, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | glue_assets_s3_bucket_name = self.node.try_get_context('glue_assets_s3_bucket_name') 21 | glue_job_script_file_name = self.node.try_get_context('glue_job_script_file_name') 22 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 23 | 24 | msk_connection_name = msk_connection_info.name 25 | kafka_bootstrap_servers = msk_connection_info.connection_properties['KAFKA_BOOTSTRAP_SERVERS'] 26 | 27 | glue_job_default_arguments = { 28 | "--enable-metrics": "true", 29 | "--enable-spark-ui": "true", 30 | "--spark-event-logs-path": f"s3://{glue_assets_s3_bucket_name}/sparkHistoryLogs/", 31 | "--enable-job-insights": "false", 32 | "--enable-glue-datacatalog": "true", 33 | "--enable-continuous-cloudwatch-log": "true", 34 | "--job-bookmark-option": "job-bookmark-disable", 35 | "--job-language": "python", 36 | "--TempDir": f"s3://{glue_assets_s3_bucket_name}/temporary/", 37 | "--kafka_connection_name": msk_connection_name, 38 | "--kafka_bootstrap_servers": kafka_bootstrap_servers, 39 | } 40 | 41 | glue_job_default_arguments.update(glue_job_input_arguments) 42 | 43 | glue_job_name = self.node.try_get_context('glue_job_name') 44 | 45 | glue_connections_name = self.node.try_get_context('glue_connections_name') 46 | 47 | glue_cfn_job = aws_glue.CfnJob(self, "GlueStreamingETLJob", 48 | command=aws_glue.CfnJob.JobCommandProperty( 49 | name="gluestreaming", 50 | python_version="3", 51 | script_location="s3://{glue_assets}/scripts/{glue_job_script_file_name}".format( 52 | glue_assets=glue_assets_s3_bucket_name, 53 | glue_job_script_file_name=glue_job_script_file_name 54 | ) 55 | ), 56 | role=glue_job_role.role_arn, 57 | 58 | #XXX: Set only AllocatedCapacity or MaxCapacity 59 | # Do not set Allocated Capacity if using Worker Type and Number of Workers 60 | # allocated_capacity=2, 61 | connections=aws_glue.CfnJob.ConnectionsListProperty( 62 | connections=[glue_connections_name, msk_connection_name] 63 | ), 64 | default_arguments=glue_job_default_arguments, 65 | description="This job loads the data from MSK to Apache Iceberg table in S3.", 66 | execution_property=aws_glue.CfnJob.ExecutionPropertyProperty( 67 | max_concurrent_runs=1 68 | ), 69 | #XXX: check AWS Glue Version in https://docs.aws.amazon.com/glue/latest/dg/add-job.html#create-job 70 | glue_version="3.0", 71 | #XXX: Do not set Max Capacity if using Worker Type and Number of Workers 72 | # max_capacity=2, 73 | max_retries=0, 74 | name=glue_job_name, 75 | # notification_property=aws_glue.CfnJob.NotificationPropertyProperty( 76 | # notify_delay_after=10 # 10 minutes 77 | # ), 78 | number_of_workers=2, 79 | timeout=2880, 80 | worker_type="G.1X" # ['Standard' | 'G.1X' | 'G.2X' | 'G.025x'] 81 | ) 82 | 83 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobName', value=glue_cfn_job.name) 84 | cdk.CfnOutput(self, 
f'{self.stack_name}_GlueJobRoleArn', value=glue_job_role.role_arn) 85 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/glue_streaming_job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue, 10 | aws_s3 as s3, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class GlueStreamingJobStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, glue_job_role, msk_connection_info, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | glue_assets_s3_bucket_name = self.node.try_get_context('glue_assets_s3_bucket_name') 21 | glue_job_script_file_name = self.node.try_get_context('glue_job_script_file_name') 22 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 23 | 24 | msk_connection_name = msk_connection_info.name 25 | kafka_bootstrap_servers = msk_connection_info.connection_properties['KAFKA_BOOTSTRAP_SERVERS'] 26 | 27 | glue_job_default_arguments = { 28 | "--enable-metrics": "true", 29 | "--enable-spark-ui": "true", 30 | "--spark-event-logs-path": f"s3://{glue_assets_s3_bucket_name}/sparkHistoryLogs/", 31 | "--enable-job-insights": "false", 32 | "--enable-glue-datacatalog": "true", 33 | "--enable-continuous-cloudwatch-log": "true", 34 | "--job-bookmark-option": "job-bookmark-disable", 35 | "--job-language": "python", 36 | "--TempDir": f"s3://{glue_assets_s3_bucket_name}/temporary/", 37 | "--kafka_connection_name": msk_connection_name, 38 | "--kafka_bootstrap_servers": kafka_bootstrap_servers, 39 | } 40 | 41 | glue_job_default_arguments.update(glue_job_input_arguments) 42 | 43 | glue_job_name = self.node.try_get_context('glue_job_name') 44 | 45 | glue_connections_name = self.node.try_get_context('glue_connections_name') 46 | 47 | glue_cfn_job = aws_glue.CfnJob(self, "GlueStreamingETLJob", 48 | command=aws_glue.CfnJob.JobCommandProperty( 49 | name="gluestreaming", 50 | python_version="3", 51 | script_location="s3://{glue_assets}/scripts/{glue_job_script_file_name}".format( 52 | glue_assets=glue_assets_s3_bucket_name, 53 | glue_job_script_file_name=glue_job_script_file_name 54 | ) 55 | ), 56 | role=glue_job_role.role_arn, 57 | 58 | #XXX: Set only AllocatedCapacity or MaxCapacity 59 | # Do not set Allocated Capacity if using Worker Type and Number of Workers 60 | # allocated_capacity=2, 61 | connections=aws_glue.CfnJob.ConnectionsListProperty( 62 | connections=[glue_connections_name, msk_connection_name] 63 | ), 64 | default_arguments=glue_job_default_arguments, 65 | description="This job loads the data from MSK to Apache Iceberg table in S3.", 66 | execution_property=aws_glue.CfnJob.ExecutionPropertyProperty( 67 | max_concurrent_runs=1 68 | ), 69 | #XXX: check AWS Glue Version in https://docs.aws.amazon.com/glue/latest/dg/add-job.html#create-job 70 | glue_version="3.0", 71 | #XXX: Do not set Max Capacity if using Worker Type and Number of Workers 72 | # max_capacity=2, 73 | max_retries=0, 74 | name=glue_job_name, 75 | # notification_property=aws_glue.CfnJob.NotificationPropertyProperty( 76 | # notify_delay_after=10 # 10 minutes 77 | # ), 78 | number_of_workers=2, 79 | timeout=2880, 80 | worker_type="G.1X" # ['Standard' | 'G.1X' | 'G.2X' | 'G.025x'] 81 | ) 82 | 83 | cdk.CfnOutput(self, 
f'{self.stack_name}_GlueJobName', value=glue_cfn_job.name) 84 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRoleArn', value=glue_job_role.role_arn) 85 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/glue_job_role.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_iam 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueJobRoleStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_job_role_policy_doc = aws_iam.PolicyDocument() 20 | glue_job_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 21 | "sid": "AWSGlueJobDynamoDBAccess", 22 | "effect": aws_iam.Effect.ALLOW, 23 | #XXX: The ARN will be formatted as follows: 24 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 25 | "resources": [self.format_arn(service="dynamodb", resource="table", resource_name="*")], 26 | "actions": [ 27 | "dynamodb:BatchGetItem", 28 | "dynamodb:DescribeStream", 29 | "dynamodb:DescribeTable", 30 | "dynamodb:GetItem", 31 | "dynamodb:Query", 32 | "dynamodb:Scan", 33 | "dynamodb:BatchWriteItem", 34 | "dynamodb:CreateTable", 35 | "dynamodb:DeleteTable", 36 | "dynamodb:DeleteItem", 37 | "dynamodb:UpdateTable", 38 | "dynamodb:UpdateItem", 39 | "dynamodb:PutItem" 40 | ] 41 | })) 42 | 43 | glue_job_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 44 | "sid": "AWSGlueJobS3Access", 45 | "effect": aws_iam.Effect.ALLOW, 46 | #XXX: The ARN will be formatted as follows: 47 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 48 | "resources": ["*"], 49 | "actions": [ 50 | "s3:GetBucketLocation", 51 | "s3:ListBucket", 52 | "s3:GetBucketAcl", 53 | "s3:GetObject", 54 | "s3:PutObject", 55 | "s3:DeleteObject" 56 | ] 57 | })) 58 | 59 | glue_job_role = aws_iam.Role(self, 'GlueJobRole', 60 | role_name='GlueStreamingJobRole-MSK2Iceberg', 61 | assumed_by=aws_iam.ServicePrincipal('glue.amazonaws.com'), 62 | inline_policies={ 63 | 'aws_glue_job_role_policy': glue_job_role_policy_doc 64 | }, 65 | managed_policies=[ 66 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole'), 67 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMReadOnlyAccess'), 68 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEC2ContainerRegistryReadOnly'), 69 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AWSGlueConsoleFullAccess'), 70 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonMSKReadOnlyAccess'), 71 | # aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisReadOnlyAccess') 72 | ] 73 | ) 74 | 75 | #XXX: When creating a notebook with a role, that role is then passed to interactive sessions 76 | # so that the same role can be used in both places. 77 | # As such, the `iam:PassRole` permission needs to be part of the role's policy. 
78 | # More info at: https://docs.aws.amazon.com/glue/latest/ug/notebook-getting-started.html 79 | # 80 | glue_job_role.add_to_policy(aws_iam.PolicyStatement(**{ 81 | "sid": "AWSGlueJobIAMPassRole", 82 | "effect": aws_iam.Effect.ALLOW, 83 | #XXX: The ARN will be formatted as follows: 84 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 85 | "resources": [self.format_arn(service="iam", region="", resource="role", resource_name=glue_job_role.role_name)], 86 | "conditions": { 87 | "StringLike": { 88 | "iam:PassedToService": [ 89 | "glue.amazonaws.com" 90 | ] 91 | } 92 | }, 93 | "actions": [ 94 | "iam:PassRole" 95 | ] 96 | })) 97 | 98 | self.glue_job_role = glue_job_role 99 | 100 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRole', value=self.glue_job_role.role_name) 101 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRoleArn', value=self.glue_job_role.role_arn) 102 | -------------------------------------------------------------------------------- /msk-to-iceberg/src/main/python/spark_dataframe_insert_iceberg_from_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | 8 | from awsglue.transforms import * 9 | from awsglue.utils import getResolvedOptions 10 | from pyspark.context import SparkContext 11 | from awsglue.context import GlueContext 12 | from awsglue.job import Job 13 | from awsglue import DynamicFrame 14 | 15 | from pyspark.conf import SparkConf 16 | from pyspark.sql.types import * 17 | from pyspark.sql.functions import ( 18 | col, 19 | from_json, 20 | to_timestamp 21 | ) 22 | 23 | 24 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 25 | 'catalog', 26 | 'database_name', 27 | 'table_name', 28 | 'primary_key', 29 | 'kafka_topic_name', 30 | 'starting_offsets_of_kafka_topic', 31 | 'kafka_connection_name', 32 | 'kafka_bootstrap_servers', 33 | 'iceberg_s3_path', 34 | 'lock_table_name', 35 | 'aws_region', 36 | 'window_size' 37 | ]) 38 | 39 | CATALOG = args['catalog'] 40 | 41 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 42 | 43 | DATABASE = args['database_name'] 44 | TABLE_NAME = args['table_name'] 45 | PRIMARY_KEY = args['primary_key'] 46 | 47 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 48 | 49 | KAFKA_TOPIC_NAME = args['kafka_topic_name'] 50 | KAFKA_CONNECTION_NAME = args['kafka_connection_name'] 51 | KAFKA_BOOTSTRAP_SERVERS = args['kafka_bootstrap_servers'] 52 | 53 | #XXX: starting_offsets_of_kafka_topic: ['latest', 'earliest'] 54 | STARTING_OFFSETS_OF_KAFKA_TOPIC = args.get('starting_offsets_of_kafka_topic', 'latest') 55 | 56 | AWS_REGION = args['aws_region'] 57 | WINDOW_SIZE = args.get('window_size', '100 seconds') 58 | 59 | def setSparkIcebergConf() -> SparkConf: 60 | conf_list = [ 61 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 62 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 63 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 64 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 65 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 66 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 67 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 68 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 69 | ] 70 | spark_conf = 
SparkConf().setAll(conf_list) 71 | return spark_conf 72 | 73 | # Set the Spark + Glue context 74 | conf = setSparkIcebergConf() 75 | sc = SparkContext(conf=conf) 76 | glueContext = GlueContext(sc) 77 | spark = glueContext.spark_session 78 | job = Job(glueContext) 79 | job.init(args['JOB_NAME'], args) 80 | 81 | options_read = { 82 | "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS, 83 | "subscribe": KAFKA_TOPIC_NAME, 84 | "startingOffsets": STARTING_OFFSETS_OF_KAFKA_TOPIC 85 | } 86 | 87 | schema = StructType([ 88 | StructField("name", StringType(), False), 89 | StructField("age", IntegerType(), True), 90 | StructField("m_time", StringType(), False), 91 | ]) 92 | 93 | streaming_data = spark.readStream.format("kafka").options(**options_read).load() 94 | 95 | stream_data_df = streaming_data \ 96 | .select(from_json(col("value").cast("string"), schema).alias("source_table")) \ 97 | .select("source_table.*") \ 98 | .withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 99 | 100 | table_id = f"{CATALOG}.{DATABASE}.{TABLE_NAME}" 101 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 102 | 103 | #XXX: Writing against partitioned table 104 | # https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#writing-against-partitioned-table 105 | # Complete output mode not supported when there are no streaming aggregations on streaming DataFrame/Datasets 106 | query = stream_data_df.writeStream \ 107 | .format("iceberg") \ 108 | .outputMode("append") \ 109 | .trigger(processingTime=WINDOW_SIZE) \ 110 | .option("path", table_id) \ 111 | .option("fanout-enabled", "true") \ 112 | .option("checkpointLocation", checkpointPath) \ 113 | .start() 114 | 115 | query.awaitTermination() 116 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/src/main/python/spark_dataframe_insert_iceberg_from_msk_serverless.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | 8 | from awsglue.transforms import * 9 | from awsglue.utils import getResolvedOptions 10 | from pyspark.context import SparkContext 11 | from awsglue.context import GlueContext 12 | from awsglue.job import Job 13 | from awsglue import DynamicFrame 14 | 15 | from pyspark.conf import SparkConf 16 | from pyspark.sql.types import * 17 | from pyspark.sql.functions import ( 18 | col, 19 | from_json, 20 | to_timestamp 21 | ) 22 | 23 | 24 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 25 | 'catalog', 26 | 'database_name', 27 | 'table_name', 28 | 'primary_key', 29 | 'kafka_topic_name', 30 | 'starting_offsets_of_kafka_topic', 31 | 'kafka_connection_name', 32 | 'kafka_bootstrap_servers', 33 | 'iceberg_s3_path', 34 | 'lock_table_name', 35 | 'aws_region', 36 | 'window_size' 37 | ]) 38 | 39 | CATALOG = args['catalog'] 40 | 41 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 42 | 43 | DATABASE = args['database_name'] 44 | TABLE_NAME = args['table_name'] 45 | PRIMARY_KEY = args['primary_key'] 46 | 47 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 48 | 49 | KAFKA_TOPIC_NAME = args['kafka_topic_name'] 50 | KAFKA_CONNECTION_NAME = args['kafka_connection_name'] 51 | KAFKA_BOOTSTRAP_SERVERS = args['kafka_bootstrap_servers'] 52 | 53 | #XXX: starting_offsets_of_kafka_topic: ['latest', 'earliest'] 54 | STARTING_OFFSETS_OF_KAFKA_TOPIC = args.get('starting_offsets_of_kafka_topic', 'latest') 
55 | 56 | AWS_REGION = args['aws_region'] 57 | WINDOW_SIZE = args.get('window_size', '100 seconds') 58 | 59 | def setSparkIcebergConf() -> SparkConf: 60 | conf_list = [ 61 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 62 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 63 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 64 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 65 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 66 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 67 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 68 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 69 | ] 70 | spark_conf = SparkConf().setAll(conf_list) 71 | return spark_conf 72 | 73 | # Set the Spark + Glue context 74 | conf = setSparkIcebergConf() 75 | sc = SparkContext(conf=conf) 76 | glueContext = GlueContext(sc) 77 | spark = glueContext.spark_session 78 | job = Job(glueContext) 79 | job.init(args['JOB_NAME'], args) 80 | 81 | options_read = { 82 | "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS, 83 | "subscribe": KAFKA_TOPIC_NAME, 84 | "startingOffsets": STARTING_OFFSETS_OF_KAFKA_TOPIC, 85 | "kafka.security.protocol": "SASL_SSL", 86 | "kafka.sasl.mechanism": "AWS_MSK_IAM", 87 | "kafka.sasl.jaas.config": "software.amazon.msk.auth.iam.IAMLoginModule required;", 88 | "kafka.sasl.client.callback.handler.class": "software.amazon.msk.auth.iam.IAMClientCallbackHandler" 89 | } 90 | 91 | schema = StructType([ 92 | StructField("name", StringType(), False), 93 | StructField("age", IntegerType(), True), 94 | StructField("m_time", StringType(), False), 95 | ]) 96 | 97 | streaming_data = spark.readStream.format("kafka").options(**options_read).load() 98 | 99 | stream_data_df = streaming_data \ 100 | .select(from_json(col("value").cast("string"), schema).alias("source_table")) \ 101 | .select("source_table.*") \ 102 | .withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 103 | 104 | table_id = f"{CATALOG}.{DATABASE}.{TABLE_NAME}" 105 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 106 | 107 | #XXX: Writing against partitioned table 108 | # https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#writing-against-partitioned-table 109 | # Complete output mode not supported when there are no streaming aggregations on streaming DataFrame/Datasets 110 | query = stream_data_df.writeStream \ 111 | .format("iceberg") \ 112 | .outputMode("append") \ 113 | .trigger(processingTime=WINDOW_SIZE) \ 114 | .option("path", table_id) \ 115 | .option("fanout-enabled", "true") \ 116 | .option("checkpointLocation", checkpointPath) \ 117 | .start() 118 | 119 | query.awaitTermination() 120 | -------------------------------------------------------------------------------- /msk-to-iceberg/src/main/python/spark_sql_insert_overwrite_iceberg_from_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | import traceback 8 | 9 | from awsglue.transforms import * 10 | from awsglue.utils import getResolvedOptions 11 | from pyspark.context import SparkContext 12 | from awsglue.context import GlueContext 13 | from awsglue.job import Job 14 | from awsglue import 
DynamicFrame 15 | 16 | from pyspark.conf import SparkConf 17 | from pyspark.sql import DataFrame, Row 18 | from pyspark.sql.window import Window 19 | from pyspark.sql.types import * 20 | from pyspark.sql.functions import ( 21 | col, 22 | desc, 23 | row_number, 24 | to_timestamp 25 | ) 26 | 27 | 28 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 29 | 'catalog', 30 | 'database_name', 31 | 'table_name', 32 | 'primary_key', 33 | 'kafka_topic_name', 34 | 'starting_offsets_of_kafka_topic', 35 | 'kafka_connection_name', 36 | 'iceberg_s3_path', 37 | 'lock_table_name', 38 | 'aws_region', 39 | 'window_size' 40 | ]) 41 | 42 | CATALOG = args['catalog'] 43 | 44 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 45 | 46 | DATABASE = args['database_name'] 47 | TABLE_NAME = args['table_name'] 48 | PRIMARY_KEY = args['primary_key'] 49 | 50 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 51 | 52 | KAFKA_TOPIC_NAME = args['kafka_topic_name'] 53 | KAFKA_CONNECTION_NAME = args['kafka_connection_name'] 54 | 55 | #XXX: starting_offsets_of_kafka_topic: ['latest', 'earliest'] 56 | STARTING_OFFSETS_OF_KAFKA_TOPIC = args.get('starting_offsets_of_kafka_topic', 'latest') 57 | 58 | AWS_REGION = args['aws_region'] 59 | WINDOW_SIZE = args.get('window_size', '100 seconds') 60 | 61 | def setSparkIcebergConf() -> SparkConf: 62 | conf_list = [ 63 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 64 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 65 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 66 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 67 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 68 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 69 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 70 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 71 | ] 72 | spark_conf = SparkConf().setAll(conf_list) 73 | return spark_conf 74 | 75 | # Set the Spark + Glue context 76 | conf = setSparkIcebergConf() 77 | sc = SparkContext(conf=conf) 78 | glueContext = GlueContext(sc) 79 | spark = glueContext.spark_session 80 | job = Job(glueContext) 81 | job.init(args['JOB_NAME'], args) 82 | 83 | kafka_options = { 84 | "connectionName": KAFKA_CONNECTION_NAME, 85 | "topicName": KAFKA_TOPIC_NAME, 86 | "startingOffsets": STARTING_OFFSETS_OF_KAFKA_TOPIC, 87 | "inferSchema": "true", 88 | "classification": "json" 89 | } 90 | 91 | streaming_data = glueContext.create_data_frame.from_options( 92 | connection_type="kafka", 93 | connection_options=kafka_options, 94 | transformation_ctx="kafka_df" 95 | ) 96 | 97 | def processBatch(data_frame, batch_id): 98 | if data_frame.count() > 0: 99 | stream_data_dynf = DynamicFrame.fromDF( 100 | data_frame, glueContext, "from_data_frame" 101 | ) 102 | 103 | _df = spark.sql(f"SELECT * FROM {CATALOG}.{DATABASE}.{TABLE_NAME} LIMIT 0") 104 | 105 | #XXX: Apply De-duplication logic on input data to pick up the latest record based on timestamp and operation 106 | window = Window.partitionBy(PRIMARY_KEY).orderBy(desc("m_time")) 107 | stream_data_df = stream_data_dynf.toDF() 108 | stream_data_df = stream_data_df.withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 109 | upsert_data_df = stream_data_df.withColumn("row", row_number().over(window)) \ 110 | .filter(col("row") == 1).drop("row") \ 111 | .select(_df.schema.names) 112 | 113 | 
upsert_data_df.createOrReplaceTempView(f"{TABLE_NAME}_upsert") 114 | # print(f"Table '{TABLE_NAME}' is inserting overwrite...") 115 | 116 | sql_query = f""" 117 | INSERT OVERWRITE {CATALOG}.{DATABASE}.{TABLE_NAME} SELECT * FROM {TABLE_NAME}_upsert 118 | """ 119 | try: 120 | spark.sql(sql_query) 121 | except Exception as ex: 122 | traceback.print_exc() 123 | raise ex 124 | 125 | 126 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 127 | 128 | glueContext.forEachBatch( 129 | frame=streaming_data, 130 | batch_function=processBatch, 131 | options={ 132 | "windowSize": WINDOW_SIZE, 133 | "checkpointLocation": checkpointPath, 134 | } 135 | ) 136 | 137 | job.commit() 138 | -------------------------------------------------------------------------------- /msk-to-iceberg/src/main/python/spark_sql_merge_into_iceberg_from_kafka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | import traceback 8 | 9 | from awsglue.transforms import * 10 | from awsglue.utils import getResolvedOptions 11 | from awsglue.context import GlueContext 12 | from awsglue.job import Job 13 | from awsglue import DynamicFrame 14 | 15 | from pyspark.context import SparkContext 16 | from pyspark.conf import SparkConf 17 | from pyspark.sql import DataFrame, Row 18 | from pyspark.sql.window import Window 19 | from pyspark.sql.functions import ( 20 | col, 21 | desc, 22 | row_number, 23 | to_timestamp 24 | ) 25 | 26 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 27 | 'catalog', 28 | 'database_name', 29 | 'table_name', 30 | 'primary_key', 31 | 'kafka_topic_name', 32 | 'starting_offsets_of_kafka_topic', 33 | 'kafka_connection_name', 34 | 'iceberg_s3_path', 35 | 'lock_table_name', 36 | 'aws_region', 37 | 'window_size' 38 | ]) 39 | 40 | CATALOG = args['catalog'] 41 | 42 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 43 | 44 | DATABASE = args['database_name'] 45 | TABLE_NAME = args['table_name'] 46 | PRIMARY_KEY = args['primary_key'] 47 | 48 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 49 | 50 | KAFKA_TOPIC_NAME = args['kafka_topic_name'] 51 | KAFKA_CONNECTION_NAME = args['kafka_connection_name'] 52 | 53 | #XXX: starting_offsets_of_kafka_topic: ['latest', 'earliest'] 54 | STARTING_OFFSETS_OF_KAFKA_TOPIC = args.get('starting_offsets_of_kafka_topic', 'latest') 55 | 56 | AWS_REGION = args['aws_region'] 57 | WINDOW_SIZE = args.get('window_size', '100 seconds') 58 | 59 | def setSparkIcebergConf() -> SparkConf: 60 | conf_list = [ 61 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 62 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 63 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 64 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 65 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 66 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 67 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 68 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 69 | ] 70 | spark_conf = SparkConf().setAll(conf_list) 71 | return spark_conf 72 | 73 | # Set the Spark + Glue context 74 | conf = setSparkIcebergConf() 75 | sc = SparkContext(conf=conf) 76 | glueContext = GlueContext(sc) 77 | spark = 
glueContext.spark_session 78 | job = Job(glueContext) 79 | job.init(args['JOB_NAME'], args) 80 | 81 | kafka_options = { 82 | "connectionName": KAFKA_CONNECTION_NAME, 83 | "topicName": KAFKA_TOPIC_NAME, 84 | "startingOffsets": STARTING_OFFSETS_OF_KAFKA_TOPIC, 85 | "inferSchema": "true", 86 | "classification": "json" 87 | } 88 | 89 | streaming_data = glueContext.create_data_frame.from_options( 90 | connection_type="kafka", 91 | connection_options=kafka_options, 92 | transformation_ctx="kafka_df" 93 | ) 94 | 95 | def processBatch(data_frame, batch_id): 96 | if data_frame.count() > 0: 97 | stream_data_dynf = DynamicFrame.fromDF( 98 | data_frame, glueContext, "from_data_frame" 99 | ) 100 | 101 | tables_df = spark.sql(f"SHOW TABLES IN {CATALOG}.{DATABASE}") 102 | table_list = tables_df.select('tableName').rdd.flatMap(lambda x: x).collect() 103 | if f"{TABLE_NAME}" not in table_list: 104 | print(f"Table {TABLE_NAME} doesn't exist in {CATALOG}.{DATABASE}.") 105 | else: 106 | _df = spark.sql(f"SELECT * FROM {CATALOG}.{DATABASE}.{TABLE_NAME} LIMIT 0") 107 | 108 | #XXX: Apply De-duplication logic on input data to pick up the latest record based on timestamp and operation 109 | window = Window.partitionBy(PRIMARY_KEY).orderBy(desc("m_time")) 110 | stream_data_df = stream_data_dynf.toDF() 111 | stream_data_df = stream_data_df.withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 112 | upsert_data_df = stream_data_df.withColumn("row", row_number().over(window)) \ 113 | .filter(col("row") == 1).drop("row") \ 114 | .select(_df.schema.names) 115 | 116 | upsert_data_df.createOrReplaceTempView(f"{TABLE_NAME}_upsert") 117 | # print(f"Table '{TABLE_NAME}' is upserting...") 118 | 119 | try: 120 | spark.sql(f"""MERGE INTO {CATALOG}.{DATABASE}.{TABLE_NAME} t 121 | USING {TABLE_NAME}_upsert s ON s.{PRIMARY_KEY} = t.{PRIMARY_KEY} 122 | WHEN MATCHED THEN UPDATE SET * 123 | WHEN NOT MATCHED THEN INSERT * 124 | """) 125 | except Exception as ex: 126 | traceback.print_exc() 127 | raise ex 128 | 129 | 130 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 131 | 132 | glueContext.forEachBatch( 133 | frame=streaming_data, 134 | batch_function=processBatch, 135 | options={ 136 | "windowSize": WINDOW_SIZE, 137 | "checkpointLocation": checkpointPath, 138 | } 139 | ) 140 | 141 | job.commit() 142 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/src/main/python/spark_sql_insert_overwrite_iceberg_from_msk_serverless.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | import traceback 8 | 9 | from awsglue.transforms import * 10 | from awsglue.utils import getResolvedOptions 11 | from pyspark.context import SparkContext 12 | from awsglue.context import GlueContext 13 | from awsglue.job import Job 14 | from awsglue import DynamicFrame 15 | 16 | from pyspark.conf import SparkConf 17 | from pyspark.sql import DataFrame, Row 18 | from pyspark.sql.window import Window 19 | from pyspark.sql.types import * 20 | from pyspark.sql.functions import ( 21 | col, 22 | desc, 23 | row_number, 24 | to_timestamp 25 | ) 26 | 27 | 28 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 29 | 'catalog', 30 | 'database_name', 31 | 'table_name', 32 | 'primary_key', 33 | 'kafka_topic_name', 34 | 'starting_offsets_of_kafka_topic', 35 | 'kafka_connection_name', 36 | 
'iceberg_s3_path', 37 | 'lock_table_name', 38 | 'aws_region', 39 | 'window_size' 40 | ]) 41 | 42 | CATALOG = args['catalog'] 43 | 44 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 45 | 46 | DATABASE = args['database_name'] 47 | TABLE_NAME = args['table_name'] 48 | PRIMARY_KEY = args['primary_key'] 49 | 50 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 51 | 52 | KAFKA_TOPIC_NAME = args['kafka_topic_name'] 53 | KAFKA_CONNECTION_NAME = args['kafka_connection_name'] 54 | 55 | #XXX: starting_offsets_of_kafka_topic: ['latest', 'earliest'] 56 | STARTING_OFFSETS_OF_KAFKA_TOPIC = args.get('starting_offsets_of_kafka_topic', 'latest') 57 | 58 | AWS_REGION = args['aws_region'] 59 | WINDOW_SIZE = args.get('window_size', '100 seconds') 60 | 61 | def setSparkIcebergConf() -> SparkConf: 62 | conf_list = [ 63 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 64 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 65 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 66 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 67 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 68 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 69 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 70 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 71 | ] 72 | spark_conf = SparkConf().setAll(conf_list) 73 | return spark_conf 74 | 75 | # Set the Spark + Glue context 76 | conf = setSparkIcebergConf() 77 | sc = SparkContext(conf=conf) 78 | glueContext = GlueContext(sc) 79 | spark = glueContext.spark_session 80 | job = Job(glueContext) 81 | job.init(args['JOB_NAME'], args) 82 | 83 | kafka_options = { 84 | "connectionName": KAFKA_CONNECTION_NAME, 85 | "topicName": KAFKA_TOPIC_NAME, 86 | "startingOffsets": STARTING_OFFSETS_OF_KAFKA_TOPIC, 87 | "inferSchema": "true", 88 | "classification": "json", 89 | 90 | #XXX: the properties below are required for IAM Access control for MSK Serverless 91 | "kafka.security.protocol": "SASL_SSL", 92 | "kafka.sasl.mechanism": "AWS_MSK_IAM", 93 | "kafka.sasl.jaas.config": "software.amazon.msk.auth.iam.IAMLoginModule required;", 94 | "kafka.sasl.client.callback.handler.class": "software.amazon.msk.auth.iam.IAMClientCallbackHandler" 95 | } 96 | 97 | streaming_data = glueContext.create_data_frame.from_options( 98 | connection_type="kafka", 99 | connection_options=kafka_options, 100 | transformation_ctx="kafka_df" 101 | ) 102 | 103 | def processBatch(data_frame, batch_id): 104 | if data_frame.count() > 0: 105 | stream_data_dynf = DynamicFrame.fromDF( 106 | data_frame, glueContext, "from_data_frame" 107 | ) 108 | 109 | _df = spark.sql(f"SELECT * FROM {CATALOG}.{DATABASE}.{TABLE_NAME} LIMIT 0") 110 | 111 | #XXX: Apply De-duplication logic on input data to pick up the latest record based on timestamp and operation 112 | window = Window.partitionBy(PRIMARY_KEY).orderBy(desc("m_time")) 113 | stream_data_df = stream_data_dynf.toDF() 114 | stream_data_df = stream_data_df.withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 115 | upsert_data_df = stream_data_df.withColumn("row", row_number().over(window)) \ 116 | .filter(col("row") == 1).drop("row") \ 117 | .select(_df.schema.names) 118 | 119 | upsert_data_df.createOrReplaceTempView(f"{TABLE_NAME}_upsert") 120 | # print(f"Table '{TABLE_NAME}' is inserting overwrite...") 121 | 122 | sql_query = f""" 123 | INSERT 
OVERWRITE {CATALOG}.{DATABASE}.{TABLE_NAME} SELECT * FROM {TABLE_NAME}_upsert 124 | """ 125 | try: 126 | spark.sql(sql_query) 127 | except Exception as ex: 128 | traceback.print_exc() 129 | raise ex 130 | 131 | 132 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 133 | 134 | glueContext.forEachBatch( 135 | frame=streaming_data, 136 | batch_function=processBatch, 137 | options={ 138 | "windowSize": WINDOW_SIZE, 139 | "checkpointLocation": checkpointPath, 140 | } 141 | ) 142 | 143 | job.commit() 144 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/msk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import re 6 | import random 7 | import string 8 | 9 | import aws_cdk as cdk 10 | 11 | from aws_cdk import ( 12 | Stack, 13 | aws_ec2, 14 | aws_msk 15 | ) 16 | from constructs import Construct 17 | 18 | random.seed(43) 19 | 20 | 21 | class MskStack(Stack): 22 | 23 | def __init__(self, scope: Construct, construct_id: str, vpc, **kwargs) -> None: 24 | super().__init__(scope, construct_id, **kwargs) 25 | 26 | msk_config = self.node.try_get_context('msk') 27 | 28 | MSK_CLUSTER_NAME = msk_config['cluster_name'] 29 | assert len(MSK_CLUSTER_NAME) <= 64 and re.fullmatch(r'[a-zA-Z]+[a-zA-Z0-9-]*', MSK_CLUSTER_NAME) 30 | 31 | # Supported Apache Kafka versions: 32 | # https://docs.aws.amazon.com/msk/latest/developerguide/supported-kafka-versions.html 33 | KAFA_VERSION = msk_config.get('kafka_version', '2.8.1') 34 | 35 | KAFA_BROKER_INSTANCE_TYPE = msk_config.get('broker_instance_type', 'kafka.m5.large') 36 | KAFA_NUMBER_OF_BROKER_NODES = int(msk_config.get('number_of_broker_nodes', 3)) 37 | 38 | KAFA_BROKER_EBS_VOLUME_SIZE = int(msk_config.get('broker_ebs_volume_size', 100)) 39 | assert (1 <= KAFA_BROKER_EBS_VOLUME_SIZE and KAFA_BROKER_EBS_VOLUME_SIZE <= 16384) 40 | 41 | MSK_CLIENT_SG_NAME = 'use-msk-sg-{}'.format(''.join(random.sample((string.ascii_lowercase), k=5))) 42 | sg_msk_client = aws_ec2.SecurityGroup(self, 'KafkaClientSecurityGroup', 43 | vpc=vpc, 44 | allow_all_outbound=True, 45 | description='security group for Amazon MSK client', 46 | security_group_name=MSK_CLIENT_SG_NAME 47 | ) 48 | cdk.Tags.of(sg_msk_client).add('Name', MSK_CLIENT_SG_NAME) 49 | 50 | MSK_CLUSTER_SG_NAME = 'msk-sg-{}'.format(''.join(random.sample((string.ascii_lowercase), k=5))) 51 | sg_msk_cluster = aws_ec2.SecurityGroup(self, 'MSKSecurityGroup', 52 | vpc=vpc, 53 | allow_all_outbound=True, 54 | description='security group for Amazon MSK Cluster', 55 | security_group_name=MSK_CLUSTER_SG_NAME 56 | ) 57 | # For more information about the numbers of the ports that Amazon MSK uses to communicate with client machines, 58 | # see https://docs.aws.amazon.com/msk/latest/developerguide/port-info.html 59 | sg_msk_cluster.add_ingress_rule(peer=sg_msk_client, connection=aws_ec2.Port.tcp(2181), 60 | description='allow msk client to communicate with Apache ZooKeeper in plaintext') 61 | sg_msk_cluster.add_ingress_rule(peer=sg_msk_client, connection=aws_ec2.Port.tcp(2182), 62 | description='allow msk client to communicate with Apache ZooKeeper by using TLS encryption') 63 | sg_msk_cluster.add_ingress_rule(peer=sg_msk_client, connection=aws_ec2.Port.tcp(9092), 64 | description='allow msk client to communicate with brokers in plaintext') 65 | sg_msk_cluster.add_ingress_rule(peer=sg_msk_client, 
connection=aws_ec2.Port.tcp(9094), 66 | description='allow msk client to communicate with brokers by using TLS encryption') 67 | sg_msk_cluster.add_ingress_rule(peer=sg_msk_client, connection=aws_ec2.Port.tcp(9098), 68 | description='msk client security group') 69 | cdk.Tags.of(sg_msk_cluster).add('Name', MSK_CLUSTER_SG_NAME) 70 | 71 | msk_broker_ebs_storage_info = aws_msk.CfnCluster.EBSStorageInfoProperty(volume_size=KAFA_BROKER_EBS_VOLUME_SIZE) 72 | 73 | msk_broker_storage_info = aws_msk.CfnCluster.StorageInfoProperty( 74 | ebs_storage_info=msk_broker_ebs_storage_info 75 | ) 76 | 77 | msk_broker_node_group_info = aws_msk.CfnCluster.BrokerNodeGroupInfoProperty( 78 | client_subnets=vpc.select_subnets(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS).subnet_ids, 79 | instance_type=KAFA_BROKER_INSTANCE_TYPE, 80 | security_groups=[sg_msk_client.security_group_id, sg_msk_cluster.security_group_id], 81 | storage_info=msk_broker_storage_info 82 | ) 83 | 84 | msk_encryption_info = aws_msk.CfnCluster.EncryptionInfoProperty( 85 | encryption_in_transit=aws_msk.CfnCluster.EncryptionInTransitProperty( 86 | client_broker='TLS_PLAINTEXT', 87 | in_cluster=True 88 | ) 89 | ) 90 | 91 | msk_cluster = aws_msk.CfnCluster(self, 'AWSKafkaCluster', 92 | broker_node_group_info=msk_broker_node_group_info, 93 | cluster_name=MSK_CLUSTER_NAME, 94 | #XXX: Supported Apache Kafka versions 95 | # https://docs.aws.amazon.com/msk/latest/developerguide/supported-kafka-versions.html 96 | kafka_version=KAFA_VERSION, 97 | number_of_broker_nodes=KAFA_NUMBER_OF_BROKER_NODES, 98 | encryption_info=msk_encryption_info, 99 | enhanced_monitoring='PER_TOPIC_PER_BROKER' 100 | ) 101 | 102 | self.sg_msk_client = sg_msk_client 103 | self.msk_cluster_name = msk_cluster.cluster_name 104 | 105 | cdk.CfnOutput(self, f'{self.stack_name}-MSKClusterName', value=msk_cluster.cluster_name, export_name=f'{self.stack_name}-MSKClusterName') 106 | cdk.CfnOutput(self, f'{self.stack_name}-MSKClusterArn', value=msk_cluster.attr_arn, export_name=f'{self.stack_name}-MSKClusterArn') 107 | cdk.CfnOutput(self, f'{self.stack_name}-MSKVersion', value=msk_cluster.kafka_version, export_name=f'{self.stack_name}-MSKVersion') 108 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/src/main/python/spark_sql_merge_into_iceberg_from_msk_serverless.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | import traceback 8 | 9 | from awsglue.transforms import * 10 | from awsglue.utils import getResolvedOptions 11 | from awsglue.context import GlueContext 12 | from awsglue.job import Job 13 | from awsglue import DynamicFrame 14 | 15 | from pyspark.context import SparkContext 16 | from pyspark.conf import SparkConf 17 | from pyspark.sql import DataFrame, Row 18 | from pyspark.sql.window import Window 19 | from pyspark.sql.functions import ( 20 | col, 21 | desc, 22 | row_number, 23 | to_timestamp 24 | ) 25 | 26 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 27 | 'catalog', 28 | 'database_name', 29 | 'table_name', 30 | 'primary_key', 31 | 'kafka_topic_name', 32 | 'starting_offsets_of_kafka_topic', 33 | 'kafka_connection_name', 34 | 'iceberg_s3_path', 35 | 'lock_table_name', 36 | 'aws_region', 37 | 'window_size' 38 | ]) 39 | 40 | CATALOG = args['catalog'] 41 | 42 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 43 | 44 | DATABASE = 
args['database_name'] 45 | TABLE_NAME = args['table_name'] 46 | PRIMARY_KEY = args['primary_key'] 47 | 48 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 49 | 50 | KAFKA_TOPIC_NAME = args['kafka_topic_name'] 51 | KAFKA_CONNECTION_NAME = args['kafka_connection_name'] 52 | 53 | #XXX: starting_offsets_of_kafka_topic: ['latest', 'earliest'] 54 | STARTING_OFFSETS_OF_KAFKA_TOPIC = args.get('starting_offsets_of_kafka_topic', 'latest') 55 | 56 | AWS_REGION = args['aws_region'] 57 | WINDOW_SIZE = args.get('window_size', '100 seconds') 58 | 59 | def setSparkIcebergConf() -> SparkConf: 60 | conf_list = [ 61 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 62 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 63 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 64 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 65 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 66 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 67 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 68 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 69 | ] 70 | spark_conf = SparkConf().setAll(conf_list) 71 | return spark_conf 72 | 73 | # Set the Spark + Glue context 74 | conf = setSparkIcebergConf() 75 | sc = SparkContext(conf=conf) 76 | glueContext = GlueContext(sc) 77 | spark = glueContext.spark_session 78 | job = Job(glueContext) 79 | job.init(args['JOB_NAME'], args) 80 | 81 | kafka_options = { 82 | "connectionName": KAFKA_CONNECTION_NAME, 83 | "topicName": KAFKA_TOPIC_NAME, 84 | "startingOffsets": STARTING_OFFSETS_OF_KAFKA_TOPIC, 85 | "inferSchema": "true", 86 | "classification": "json", 87 | 88 | #XXX: the properties below are required for IAM Access control for MSK Serverless 89 | "kafka.security.protocol": "SASL_SSL", 90 | "kafka.sasl.mechanism": "AWS_MSK_IAM", 91 | "kafka.sasl.jaas.config": "software.amazon.msk.auth.iam.IAMLoginModule required;", 92 | "kafka.sasl.client.callback.handler.class": "software.amazon.msk.auth.iam.IAMClientCallbackHandler" 93 | } 94 | 95 | streaming_data = glueContext.create_data_frame.from_options( 96 | connection_type="kafka", 97 | connection_options=kafka_options, 98 | transformation_ctx="kafka_df" 99 | ) 100 | 101 | def processBatch(data_frame, batch_id): 102 | if data_frame.count() > 0: 103 | stream_data_dynf = DynamicFrame.fromDF( 104 | data_frame, glueContext, "from_data_frame" 105 | ) 106 | 107 | tables_df = spark.sql(f"SHOW TABLES IN {CATALOG}.{DATABASE}") 108 | table_list = tables_df.select('tableName').rdd.flatMap(lambda x: x).collect() 109 | if f"{TABLE_NAME}" not in table_list: 110 | print(f"Table {TABLE_NAME} doesn't exist in {CATALOG}.{DATABASE}.") 111 | else: 112 | _df = spark.sql(f"SELECT * FROM {CATALOG}.{DATABASE}.{TABLE_NAME} LIMIT 0") 113 | 114 | #XXX: Apply De-duplication logic on input data to pick up the latest record based on timestamp and operation 115 | window = Window.partitionBy(PRIMARY_KEY).orderBy(desc("m_time")) 116 | stream_data_df = stream_data_dynf.toDF() 117 | stream_data_df = stream_data_df.withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 118 | upsert_data_df = stream_data_df.withColumn("row", row_number().over(window)) \ 119 | .filter(col("row") == 1).drop("row") \ 120 | .select(_df.schema.names) 121 | 122 | upsert_data_df.createOrReplaceTempView(f"{TABLE_NAME}_upsert") 123 | # print(f"Table 
'{TABLE_NAME}' is upserting...") 124 | 125 | try: 126 | spark.sql(f"""MERGE INTO {CATALOG}.{DATABASE}.{TABLE_NAME} t 127 | USING {TABLE_NAME}_upsert s ON s.{PRIMARY_KEY} = t.{PRIMARY_KEY} 128 | WHEN MATCHED THEN UPDATE SET * 129 | WHEN NOT MATCHED THEN INSERT * 130 | """) 131 | except Exception as ex: 132 | traceback.print_exc() 133 | raise ex 134 | 135 | 136 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 137 | 138 | glueContext.forEachBatch( 139 | frame=streaming_data, 140 | batch_function=processBatch, 141 | options={ 142 | "windowSize": WINDOW_SIZE, 143 | "checkpointLocation": checkpointPath, 144 | } 145 | ) 146 | 147 | job.commit() 148 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/glue_job_role.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_iam 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueJobRoleStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, msk_cluster_name, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_job_role_policy_doc = aws_iam.PolicyDocument() 20 | glue_job_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 21 | "sid": "AWSGlueJobDynamoDBAccess", 22 | "effect": aws_iam.Effect.ALLOW, 23 | #XXX: The ARN will be formatted as follows: 24 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 25 | "resources": [self.format_arn(service="dynamodb", resource="table", resource_name="*")], 26 | "actions": [ 27 | "dynamodb:BatchGetItem", 28 | "dynamodb:DescribeStream", 29 | "dynamodb:DescribeTable", 30 | "dynamodb:GetItem", 31 | "dynamodb:Query", 32 | "dynamodb:Scan", 33 | "dynamodb:BatchWriteItem", 34 | "dynamodb:CreateTable", 35 | "dynamodb:DeleteTable", 36 | "dynamodb:DeleteItem", 37 | "dynamodb:UpdateTable", 38 | "dynamodb:UpdateItem", 39 | "dynamodb:PutItem" 40 | ] 41 | })) 42 | 43 | glue_job_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 44 | "sid": "AWSGlueJobS3Access", 45 | "effect": aws_iam.Effect.ALLOW, 46 | #XXX: The ARN will be formatted as follows: 47 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 48 | "resources": ["*"], 49 | "actions": [ 50 | "s3:GetBucketLocation", 51 | "s3:ListBucket", 52 | "s3:GetBucketAcl", 53 | "s3:GetObject", 54 | "s3:PutObject", 55 | "s3:DeleteObject" 56 | ] 57 | })) 58 | 59 | glue_job_role = aws_iam.Role(self, 'GlueJobRole', 60 | role_name='GlueJobRole-MSKServerless2Iceberg', 61 | assumed_by=aws_iam.ServicePrincipal('glue.amazonaws.com'), 62 | inline_policies={ 63 | 'aws_glue_job_role_policy': glue_job_role_policy_doc 64 | }, 65 | managed_policies=[ 66 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole'), 67 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMReadOnlyAccess'), 68 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEC2ContainerRegistryReadOnly'), 69 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AWSGlueConsoleFullAccess'), 70 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonMSKReadOnlyAccess'), 71 | # aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisReadOnlyAccess') 72 | ] 73 | ) 74 | 75 | #XXX: When creating a notebook with a role, that 
role is then passed to interactive sessions 76 | # so that the same role can be used in both places. 77 | # As such, the `iam:PassRole` permission needs to be part of the role's policy. 78 | # More info at: https://docs.aws.amazon.com/glue/latest/ug/notebook-getting-started.html 79 | # 80 | glue_job_role.add_to_policy(aws_iam.PolicyStatement(**{ 81 | "sid": "AWSGlueJobIAMPassRole", 82 | "effect": aws_iam.Effect.ALLOW, 83 | #XXX: The ARN will be formatted as follows: 84 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 85 | "resources": [self.format_arn(service="iam", region="", resource="role", resource_name=glue_job_role.role_name)], 86 | "conditions": { 87 | "StringLike": { 88 | "iam:PassedToService": [ 89 | "glue.amazonaws.com" 90 | ] 91 | } 92 | }, 93 | "actions": [ 94 | "iam:PassRole" 95 | ] 96 | })) 97 | 98 | #XXX: For more information, see https://docs.aws.amazon.com/msk/latest/developerguide/create-iam-role.html 99 | kafka_access_control_iam_policy = aws_iam.Policy(self, 'KafkaAccessControlIAMPolicy', 100 | statements=[ 101 | aws_iam.PolicyStatement(**{ 102 | "effect": aws_iam.Effect.ALLOW, 103 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:cluster/{msk_cluster_name}/*" ], 104 | "actions": [ 105 | "kafka-cluster:Connect", 106 | "kafka-cluster:AlterCluster", 107 | "kafka-cluster:DescribeCluster" 108 | ] 109 | }), 110 | aws_iam.PolicyStatement(**{ 111 | "effect": aws_iam.Effect.ALLOW, 112 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:topic/{msk_cluster_name}/*" ], 113 | "actions": [ 114 | "kafka-cluster:*Topic*", 115 | "kafka-cluster:WriteData", 116 | "kafka-cluster:ReadData" 117 | ] 118 | }), 119 | aws_iam.PolicyStatement(**{ 120 | "effect": aws_iam.Effect.ALLOW, 121 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:group/{msk_cluster_name}/*" ], 122 | "actions": [ 123 | "kafka-cluster:AlterGroup", 124 | "kafka-cluster:DescribeGroup" 125 | ] 126 | }) 127 | ] 128 | ) 129 | kafka_access_control_iam_policy.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 130 | kafka_access_control_iam_policy.attach_to_role(glue_job_role) 131 | 132 | self.glue_job_role = glue_job_role 133 | 134 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRole', value=self.glue_job_role.role_name) 135 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRoleArn', value=self.glue_job_role.role_arn) 136 | -------------------------------------------------------------------------------- /msk-to-iceberg/cdk_stacks/kafka_client_ec2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import random 7 | import string 8 | 9 | import aws_cdk as cdk 10 | 11 | from aws_cdk import ( 12 | Stack, 13 | aws_ec2, 14 | aws_iam, 15 | aws_s3_assets 16 | ) 17 | from constructs import Construct 18 | 19 | random.seed(37) 20 | 21 | 22 | class KafkaClientEC2InstanceStack(Stack): 23 | 24 | def __init__(self, scope: Construct, construct_id: str, vpc, sg_msk_client, msk_cluster_name, **kwargs) -> None: 25 | super().__init__(scope, construct_id, **kwargs) 26 | 27 | KAFKA_CLIENT_EC2_SG_NAME = 'kafka-client-ec2-sg-{}'.format(''.join(random.choices((string.ascii_lowercase), k=5))) 28 | sg_kafka_client_ec2_instance = aws_ec2.SecurityGroup(self, 'KafkaClientEC2InstanceSG', 29 | vpc=vpc, 30 | allow_all_outbound=True, 31 | description='security group for Kafka Client EC2 Instance', 32 | 
security_group_name=KAFKA_CLIENT_EC2_SG_NAME 33 | ) 34 | cdk.Tags.of(sg_kafka_client_ec2_instance).add('Name', KAFKA_CLIENT_EC2_SG_NAME) 35 | sg_kafka_client_ec2_instance.add_ingress_rule(peer=aws_ec2.Peer.ipv4("0.0.0.0/0"), 36 | connection=aws_ec2.Port.tcp(22)) 37 | 38 | #XXX: For more information, see https://docs.aws.amazon.com/msk/latest/developerguide/create-iam-role.html 39 | kafka_client_iam_policy = aws_iam.Policy(self, 'KafkaClientIAMPolicy', 40 | statements=[ 41 | aws_iam.PolicyStatement(**{ 42 | "effect": aws_iam.Effect.ALLOW, 43 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:cluster/{msk_cluster_name}/*" ], 44 | "actions": [ 45 | "kafka-cluster:Connect", 46 | "kafka-cluster:AlterCluster", 47 | "kafka-cluster:DescribeCluster" 48 | ] 49 | }), 50 | aws_iam.PolicyStatement(**{ 51 | "effect": aws_iam.Effect.ALLOW, 52 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:topic/{msk_cluster_name}/*" ], 53 | "actions": [ 54 | "kafka-cluster:*Topic*", 55 | "kafka-cluster:WriteData", 56 | "kafka-cluster:ReadData" 57 | ] 58 | }), 59 | aws_iam.PolicyStatement(**{ 60 | "effect": aws_iam.Effect.ALLOW, 61 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:group/{msk_cluster_name}/*" ], 62 | "actions": [ 63 | "kafka-cluster:AlterGroup", 64 | "kafka-cluster:DescribeGroup" 65 | ] 66 | }) 67 | ] 68 | ) 69 | kafka_client_iam_policy.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 70 | 71 | kafka_client_ec2_instance_role = aws_iam.Role(self, 'KafkaClientEC2InstanceRole', 72 | role_name=f'KafkaClientEC2InstanceRole-{self.stack_name}', 73 | assumed_by=aws_iam.ServicePrincipal('ec2.amazonaws.com'), 74 | managed_policies=[ 75 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMManagedInstanceCore'), 76 | #XXX: EC2 instance should be able to access S3 for user data 77 | # aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3ReadOnlyAccess') 78 | ] 79 | ) 80 | 81 | kafka_client_iam_policy.attach_to_role(kafka_client_ec2_instance_role) 82 | 83 | amzn_linux = aws_ec2.MachineImage.latest_amazon_linux( 84 | generation=aws_ec2.AmazonLinuxGeneration.AMAZON_LINUX_2, 85 | edition=aws_ec2.AmazonLinuxEdition.STANDARD, 86 | virtualization=aws_ec2.AmazonLinuxVirt.HVM, 87 | storage=aws_ec2.AmazonLinuxStorage.GENERAL_PURPOSE, 88 | cpu_type=aws_ec2.AmazonLinuxCpuType.X86_64 89 | ) 90 | 91 | msk_client_ec2_instance = aws_ec2.Instance(self, 'KafkaClientEC2Instance', 92 | instance_type=aws_ec2.InstanceType.of(instance_class=aws_ec2.InstanceClass.BURSTABLE2, 93 | instance_size=aws_ec2.InstanceSize.MICRO), 94 | machine_image=amzn_linux, 95 | vpc=vpc, 96 | availability_zone=vpc.select_subnets(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS).availability_zones[0], 97 | instance_name=f'KafkaClientInstance-{self.stack_name}', 98 | role=kafka_client_ec2_instance_role, 99 | security_group=sg_kafka_client_ec2_instance, 100 | vpc_subnets=aws_ec2.SubnetSelection(subnet_type=aws_ec2.SubnetType.PUBLIC) 101 | ) 102 | msk_client_ec2_instance.add_security_group(sg_msk_client) 103 | 104 | # test data generator script in S3 as Asset 105 | user_data_asset = aws_s3_assets.Asset(self, 'KafkaClientEC2UserData', 106 | path=os.path.join(os.path.dirname(__file__), '../src/utils/gen_fake_data.py')) 107 | user_data_asset.grant_read(msk_client_ec2_instance.role) 108 | 109 | USER_DATA_LOCAL_PATH = msk_client_ec2_instance.user_data.add_s3_download_command( 110 | bucket=user_data_asset.bucket, 111 | bucket_key=user_data_asset.s3_object_key 112 | ) 113 | 114 | commands = 
''' 115 | yum update -y 116 | yum install python3.7 -y 117 | yum install java-11 -y 118 | yum install -y jq 119 | 120 | mkdir -p /home/ec2-user/opt 121 | cd /home/ec2-user/opt 122 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 123 | tar -xzf kafka_2.12-2.8.1.tgz 124 | ln -nsf kafka_2.12-2.8.1 kafka 125 | 126 | cd /home/ec2-user/opt/kafka/libs 127 | wget https://github.com/aws/aws-msk-iam-auth/releases/download/v1.1.1/aws-msk-iam-auth-1.1.1-all.jar 128 | 129 | chown -R ec2-user /home/ec2-user/opt 130 | chgrp -R ec2-user /home/ec2-user/opt 131 | 132 | cd /home/ec2-user 133 | wget https://bootstrap.pypa.io/get-pip.py 134 | su -c "python3.7 get-pip.py --user" -s /bin/sh ec2-user 135 | su -c "/home/ec2-user/.local/bin/pip3 install boto3 --user" -s /bin/sh ec2-user 136 | 137 | cat < msk_serverless_client.properties 138 | security.protocol=SASL_SSL 139 | sasl.mechanism=AWS_MSK_IAM 140 | sasl.jaas.config=software.amazon.msk.auth.iam.IAMLoginModule required; 141 | sasl.client.callback.handler.class=software.amazon.msk.auth.iam.IAMClientCallbackHandler 142 | EOF 143 | 144 | ln -nsf msk_serverless_client.properties client.properties 145 | chown -R ec2-user /home/ec2-user/msk_serverless_client.properties 146 | chown -R ec2-user /home/ec2-user/client.properties 147 | 148 | echo 'export PATH=$HOME/opt/kafka/bin:$PATH' >> .bash_profile 149 | ''' 150 | 151 | commands += f''' 152 | su -c "/home/ec2-user/.local/bin/pip3 install mimesis==4.1.3 --user" -s /bin/sh ec2-user 153 | cp {USER_DATA_LOCAL_PATH} /home/ec2-user/gen_fake_data.py & chown -R ec2-user /home/ec2-user/gen_fake_data.py 154 | ''' 155 | 156 | msk_client_ec2_instance.user_data.add_commands(commands) 157 | 158 | cdk.CfnOutput(self, f'{self.stack_name}-EC2InstancePublicDNS', 159 | value=msk_client_ec2_instance.instance_public_dns_name, 160 | export_name=f'{self.stack_name}-EC2InstancePublicDNS') 161 | cdk.CfnOutput(self, f'{self.stack_name}-EC2InstanceId', 162 | value=msk_client_ec2_instance.instance_id, 163 | export_name=f'{self.stack_name}-EC2InstanceId') 164 | cdk.CfnOutput(self, f'{self.stack_name}-EC2InstanceAZ', 165 | value=msk_client_ec2_instance.instance_availability_zone, 166 | export_name=f'{self.stack_name}-EC2InstanceAZ') 167 | 168 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/cdk_stacks/kafka_client_ec2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import random 7 | import string 8 | 9 | import aws_cdk as cdk 10 | 11 | from aws_cdk import ( 12 | Stack, 13 | aws_ec2, 14 | aws_iam, 15 | aws_s3_assets 16 | ) 17 | from constructs import Construct 18 | 19 | random.seed(37) 20 | 21 | 22 | class KafkaClientEC2InstanceStack(Stack): 23 | 24 | def __init__(self, scope: Construct, construct_id: str, vpc, sg_msk_client, msk_cluster_name, **kwargs) -> None: 25 | super().__init__(scope, construct_id, **kwargs) 26 | 27 | KAFKA_CLIENT_EC2_SG_NAME = 'kafka-client-ec2-sg-{}'.format(''.join(random.choices((string.ascii_lowercase), k=5))) 28 | sg_kafka_client_ec2_instance = aws_ec2.SecurityGroup(self, 'KafkaClientEC2InstanceSG', 29 | vpc=vpc, 30 | allow_all_outbound=True, 31 | description='security group for Kafka Client EC2 Instance', 32 | security_group_name=KAFKA_CLIENT_EC2_SG_NAME 33 | ) 34 | cdk.Tags.of(sg_kafka_client_ec2_instance).add('Name', KAFKA_CLIENT_EC2_SG_NAME) 35 | 
sg_kafka_client_ec2_instance.add_ingress_rule(peer=aws_ec2.Peer.ipv4("0.0.0.0/0"), 36 | connection=aws_ec2.Port.tcp(22)) 37 | 38 | #XXX: For more information, see https://docs.aws.amazon.com/msk/latest/developerguide/create-iam-role.html 39 | kafka_client_iam_policy = aws_iam.Policy(self, 'KafkaClientIAMPolicy', 40 | statements=[ 41 | aws_iam.PolicyStatement(**{ 42 | "effect": aws_iam.Effect.ALLOW, 43 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:cluster/{msk_cluster_name}/*" ], 44 | "actions": [ 45 | "kafka-cluster:Connect", 46 | "kafka-cluster:AlterCluster", 47 | "kafka-cluster:DescribeCluster" 48 | ] 49 | }), 50 | aws_iam.PolicyStatement(**{ 51 | "effect": aws_iam.Effect.ALLOW, 52 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:topic/{msk_cluster_name}/*" ], 53 | "actions": [ 54 | "kafka-cluster:Connect", 55 | "kafka-cluster:*Topic*", 56 | "kafka-cluster:WriteData", 57 | "kafka-cluster:ReadData" 58 | ] 59 | }), 60 | aws_iam.PolicyStatement(**{ 61 | "effect": aws_iam.Effect.ALLOW, 62 | "resources": [ f"arn:aws:kafka:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:group/{msk_cluster_name}/*" ], 63 | "actions": [ 64 | "kafka-cluster:AlterGroup", 65 | "kafka-cluster:DescribeGroup" 66 | ] 67 | }) 68 | ] 69 | ) 70 | kafka_client_iam_policy.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 71 | 72 | kafka_client_ec2_instance_role = aws_iam.Role(self, 'KafkaClientEC2InstanceRole', 73 | role_name=f'KafkaClientEC2InstanceRole-{self.stack_name}', 74 | assumed_by=aws_iam.ServicePrincipal('ec2.amazonaws.com'), 75 | managed_policies=[ 76 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMManagedInstanceCore'), 77 | #XXX: EC2 instance should be able to access S3 for user data 78 | # aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonS3ReadOnlyAccess') 79 | ] 80 | ) 81 | 82 | kafka_client_iam_policy.attach_to_role(kafka_client_ec2_instance_role) 83 | 84 | amzn_linux = aws_ec2.MachineImage.latest_amazon_linux( 85 | generation=aws_ec2.AmazonLinuxGeneration.AMAZON_LINUX_2, 86 | edition=aws_ec2.AmazonLinuxEdition.STANDARD, 87 | virtualization=aws_ec2.AmazonLinuxVirt.HVM, 88 | storage=aws_ec2.AmazonLinuxStorage.GENERAL_PURPOSE, 89 | cpu_type=aws_ec2.AmazonLinuxCpuType.X86_64 90 | ) 91 | 92 | msk_client_ec2_instance = aws_ec2.Instance(self, 'KafkaClientEC2Instance', 93 | instance_type=aws_ec2.InstanceType.of(instance_class=aws_ec2.InstanceClass.BURSTABLE2, 94 | instance_size=aws_ec2.InstanceSize.MICRO), 95 | machine_image=amzn_linux, 96 | vpc=vpc, 97 | availability_zone=vpc.select_subnets(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS).availability_zones[0], 98 | instance_name=f'KafkaClientInstance-{self.stack_name}', 99 | role=kafka_client_ec2_instance_role, 100 | security_group=sg_kafka_client_ec2_instance, 101 | vpc_subnets=aws_ec2.SubnetSelection(subnet_type=aws_ec2.SubnetType.PUBLIC) 102 | ) 103 | msk_client_ec2_instance.add_security_group(sg_msk_client) 104 | 105 | # test data generator script in S3 as Asset 106 | user_data_asset = aws_s3_assets.Asset(self, 'KafkaClientEC2UserData', 107 | path=os.path.join(os.path.dirname(__file__), '../src/utils/gen_fake_data.py')) 108 | user_data_asset.grant_read(msk_client_ec2_instance.role) 109 | 110 | USER_DATA_LOCAL_PATH = msk_client_ec2_instance.user_data.add_s3_download_command( 111 | bucket=user_data_asset.bucket, 112 | bucket_key=user_data_asset.s3_object_key 113 | ) 114 | 115 | commands = ''' 116 | yum update -y 117 | yum install python3.7 -y 118 | yum install java-11 -y 119 | yum install -y jq 120 | 
121 | mkdir -p /home/ec2-user/opt 122 | cd /home/ec2-user/opt 123 | wget https://archive.apache.org/dist/kafka/2.8.1/kafka_2.12-2.8.1.tgz 124 | tar -xzf kafka_2.12-2.8.1.tgz 125 | ln -nsf kafka_2.12-2.8.1 kafka 126 | 127 | cd /home/ec2-user/opt/kafka/libs 128 | wget https://github.com/aws/aws-msk-iam-auth/releases/download/v1.1.1/aws-msk-iam-auth-1.1.1-all.jar 129 | 130 | chown -R ec2-user /home/ec2-user/opt 131 | chgrp -R ec2-user /home/ec2-user/opt 132 | 133 | cd /home/ec2-user 134 | wget https://bootstrap.pypa.io/get-pip.py 135 | su -c "python3.7 get-pip.py --user" -s /bin/sh ec2-user 136 | su -c "/home/ec2-user/.local/bin/pip3 install boto3 --user" -s /bin/sh ec2-user 137 | 138 | cat <<EOF > msk_serverless_client.properties 139 | security.protocol=SASL_SSL 140 | sasl.mechanism=AWS_MSK_IAM 141 | sasl.jaas.config=software.amazon.msk.auth.iam.IAMLoginModule required; 142 | sasl.client.callback.handler.class=software.amazon.msk.auth.iam.IAMClientCallbackHandler 143 | EOF 144 | 145 | ln -nsf msk_serverless_client.properties client.properties 146 | chown -R ec2-user /home/ec2-user/msk_serverless_client.properties 147 | chown -R ec2-user /home/ec2-user/client.properties 148 | 149 | echo 'export PATH=$HOME/opt/kafka/bin:$PATH' >> .bash_profile 150 | ''' 151 | 152 | commands += f''' 153 | su -c "/home/ec2-user/.local/bin/pip3 install mimesis==4.1.3 --user" -s /bin/sh ec2-user 154 | cp {USER_DATA_LOCAL_PATH} /home/ec2-user/gen_fake_data.py && chown -R ec2-user /home/ec2-user/gen_fake_data.py 155 | ''' 156 | 157 | msk_client_ec2_instance.user_data.add_commands(commands) 158 | 159 | cdk.CfnOutput(self, f'{self.stack_name}-EC2InstancePublicDNS', 160 | value=msk_client_ec2_instance.instance_public_dns_name, 161 | export_name=f'{self.stack_name}-EC2InstancePublicDNS') 162 | cdk.CfnOutput(self, f'{self.stack_name}-EC2InstanceId', 163 | value=msk_client_ec2_instance.instance_id, 164 | export_name=f'{self.stack_name}-EC2InstanceId') 165 | cdk.CfnOutput(self, f'{self.stack_name}-EC2InstanceAZ', 166 | value=msk_client_ec2_instance.instance_availability_zone, 167 | export_name=f'{self.stack_name}-EC2InstanceAZ') 168 | 169 | -------------------------------------------------------------------------------- /msk-to-iceberg/README.md: -------------------------------------------------------------------------------- 1 | 2 | # AWS Glue Streaming ETL Job with Amazon MSK and Apache Iceberg 3 | 4 | ![glue-streaming-data-from-kafka-to-iceberg-table](./glue-streaming-data-from-kafka-to-iceberg-table.svg) 5 | 6 | In this project, we create a streaming ETL job in AWS Glue to integrate Iceberg with a streaming use case and create an in-place updatable data lake on Amazon S3. 7 | 8 | After streaming data are ingested from Amazon Managed Streaming for Apache Kafka (MSK) to Amazon S3, you can query the data with [Amazon Athena](http://aws.amazon.com/athena). 9 | 10 | This project can be deployed with [AWS CDK Python](https://docs.aws.amazon.com/cdk/api/v2/). 11 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 12 | 13 | This project is set up like a standard Python project. The initialization 14 | process also creates a virtualenv within this project, stored under the `.venv` 15 | directory. To create the virtualenv it assumes that there is a `python3` 16 | (or `python` for Windows) executable in your path with access to the `venv` 17 | package. If for any reason the automatic creation of the virtualenv fails, 18 | you can create the virtualenv manually.
19 | 20 | To manually create a virtualenv on MacOS and Linux: 21 | 22 | ``` 23 | $ python3 -m venv .venv 24 | ``` 25 | 26 | After the init process completes and the virtualenv is created, you can use the following 27 | step to activate your virtualenv. 28 | 29 | ``` 30 | $ source .venv/bin/activate 31 | ``` 32 | 33 | If you are on a Windows platform, you would activate the virtualenv like this: 34 | 35 | ``` 36 | % .venv\Scripts\activate.bat 37 | ``` 38 | 39 | Once the virtualenv is activated, you can install the required dependencies. 40 | 41 | ``` 42 | (.venv) $ pip install -r requirements.txt 43 | ``` 44 | 45 | In case of `AWS Glue 3.0`, before synthesizing the CloudFormation template, **you first need to set up the Apache Iceberg connector for AWS Glue in order to use Apache Iceberg with AWS Glue jobs.** (For more information, see [References](#references) (2)) 46 | 47 | Then you should set up the cdk context configuration file, `cdk.context.json`, appropriately. 48 | 49 | For example: 50 |
 51 | {
 52 |   "vpc_name": "default",
 53 |   "msk": {
 54 |     "cluster_name": "iceberg-demo-stream",
 55 |     "kafka_version": "2.8.1",
 56 |     "broker_instance_type": "kafka.m5.large",
 57 |     "number_of_broker_nodes": 3,
 58 |     "broker_ebs_volume_size": 100
 59 |   },
 60 |   "glue_assets_s3_bucket_name": "aws-glue-assets-123456789012-atq4q5u",
 61 |   "glue_job_script_file_name": "spark_sql_merge_into_iceberg_from_kafka.py",
 62 |   "glue_job_name": "streaming_data_from_kafka_into_iceberg_table",
 63 |   "glue_job_input_arguments": {
 64 |     "--catalog": "job_catalog",
 65 |     "--database_name": "iceberg_demo_db",
 66 |     "--table_name": "iceberg_demo_table",
 67 |     "--primary_key": "name",
 68 |     "--kafka_topic_name": "ev_stream_data",
 69 |     "--starting_offsets_of_kafka_topic": "latest",
 70 |     "--iceberg_s3_path": "s3://glue-iceberg-demo-atq4q5u/iceberg_demo_db",
 71 |     "--lock_table_name": "iceberg_lock",
 72 |     "--aws_region": "us-east-1",
 73 |     "--window_size": "100 seconds",
 74 |     "--extra-jars": "s3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar",
 75 |     "--user-jars-first": "true"
 76 |   },
 77 |   "glue_connections_name": "iceberg-connection"
 78 | }
 79 | 
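The `glue_job_input_arguments` above are delivered to the Glue job as job parameters. As a rough sketch (argument names are taken from the example configuration above; the scripts under `src/main/python/` remain the source of truth), a job script can read them with `getResolvedOptions`:

```
import sys

from awsglue.utils import getResolvedOptions

# Resolve the job parameters defined in "glue_job_input_arguments" (names without the leading "--").
args = getResolvedOptions(sys.argv, [
    'JOB_NAME',
    'catalog',
    'database_name',
    'table_name',
    'primary_key',
    'kafka_topic_name',
    'starting_offsets_of_kafka_topic',
    'iceberg_s3_path',
    'lock_table_name',
    'aws_region',
    'window_size'
])

CATALOG = args['catalog']
ICEBERG_S3_PATH = args['iceberg_s3_path']
```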
80 | 81 | :information_source: The `--primary_key` option should be set to the Iceberg table's primary key column name. 82 | 83 | :warning: **You should create an S3 bucket for the glue job script and upload the glue job script file into that S3 bucket.** 84 | 85 | At this point you can now synthesize the CloudFormation template for this code. 86 | 87 |
 88 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
 89 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
 90 | (.venv) $ cdk synth --all
 91 | 
92 | 93 | To add additional dependencies, for example other CDK libraries, just add 94 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 95 | command. 96 | 97 | ## Run Test 98 | 99 | 1. Set up the **Apache Iceberg connector for AWS Glue** to use Apache Iceberg with AWS Glue jobs. 100 | 2. Create an S3 bucket for the Apache Iceberg table 101 |
102 |    (.venv) $ cdk deploy KafkaToIcebergS3Path
103 |    
104 | 3. Create an MSK cluster 105 |
106 |    (.venv) $ cdk deploy KafkaToIcebergStackVpc KafkaAsGlueStreamingJobDataSource
107 |    
108 | 4. Create an MSK connection for the Glue Streaming Job 109 |
110 |    (.venv) $ cdk deploy GlueMSKConnection
111 |    
112 | For more information, see [References](#references) (8) 113 | 5. Create an IAM Role for the Glue Streaming Job 114 |
115 |    (.venv) $ cdk deploy GlueStreamingMSKtoIcebergJobRole
116 |    
117 | 6. Set up a Kafka Client Machine 118 |
119 |    (.venv) $ cdk deploy KafkaClientEC2Instance
120 |    
121 | 7. Create a Glue Database for an Apache Iceberg table 122 |
123 |    (.venv) $ cdk deploy GlueIcebergDatabase
124 |    
125 | 8. Upload the **AWS SDK for Java 2.x** jar file into S3 126 |
127 |    (.venv) $ wget https://repo1.maven.org/maven2/software/amazon/awssdk/aws-sdk-java/2.17.224/aws-sdk-java-2.17.224.jar
128 |    (.venv) $ aws s3 cp aws-sdk-java-2.17.224.jar s3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar
129 |    
130 | A Glue Streaming Job might fail because of the following error: 131 |
132 |    py4j.protocol.Py4JJavaError: An error occurred while calling o135.start.
133 |    : java.lang.NoSuchMethodError: software.amazon.awssdk.utils.SystemSetting.getStringValueFromEnvironmentVariable(Ljava/lang/String;)Ljava/util/Optional
134 |    
135 | We can work around the problem by starting the Glue Job with the additional parameters: 136 |
137 |    --extra-jars s3://path/to/aws-sdk-for-java-v2.jar
138 |    --user-jars-first true
139 |    
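   These two parameters can also be supplied per job run instead of being baked into the job definition. A sketch with `boto3` (job name and S3 path are the example values used in this README):

   ```
   import boto3

   glue = boto3.client('glue', region_name='us-east-1')

   # Pass the workaround parameters as run arguments for a single run.
   glue.start_job_run(
       JobName='streaming_data_from_kafka_into_iceberg_table',
       Arguments={
           '--extra-jars': 's3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar',
           '--user-jars-first': 'true'
       }
   )
   ```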
140 | In order to do this, we might need to upload the **AWS SDK for Java 2.x** jar file into S3. 141 | 9. Create a Glue Streaming Job 142 | 143 | * (step 1) Select one of the Glue Job scripts and upload it into S3 144 | 145 | **List of Glue Job Scripts** 146 | | File name | Spark Writes | 147 | |-----------|--------------| 148 | | spark_dataframe_insert_iceberg_from_kafka.py | DataFrame append | 149 | | spark_sql_insert_overwrite_iceberg_from_kafka.py | SQL insert overwrite | 150 | | spark_sql_merge_into_iceberg_from_kafka.py | SQL merge into | 151 | 152 |
153 |      (.venv) $ ls src/main/python/
154 |       spark_dataframe_insert_iceberg_from_kafka.py
155 |       spark_sql_insert_overwrite_iceberg_from_kafka.py
156 |       spark_sql_merge_into_iceberg_from_kafka.py
157 |      (.venv) $ aws s3 mb s3://aws-glue-assets-123456789012-atq4q5u --region us-east-1
158 |      (.venv) $ aws s3 cp src/main/python/spark_sql_merge_into_iceberg_from_kafka.py s3://aws-glue-assets-123456789012-atq4q5u/scripts/
159 |      
160 | 161 | * (step 2) Provision the Glue Streaming Job 162 | 163 |
164 |      (.venv) $ cdk deploy GrantLFPermissionsOnGlueJobRole \
165 |                           GlueStreamingJobMSKtoIceberg
166 |      
167 | 168 | 10. Create a table with partitioned data in Amazon Athena 169 | 170 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console. 171 | 172 | * (step 1) Create a database 173 | 174 | In order to create a new database called `iceberg_demo_db`, enter the following statement in the Athena query editor 175 | and click the **Run** button to execute the query. 176 | 177 |
178 |      CREATE DATABASE IF NOT EXISTS iceberg_demo_db
179 |      
180 | 181 | * (step 2) Create a table 182 | 183 | Copy the following query into the Athena query editor, replace the example bucket name in the `LOCATION` clause with your own S3 bucket name, and execute the query to create a new table. 184 |
185 |       CREATE TABLE iceberg_demo_db.iceberg_demo_table (
186 |         name string,
187 |         age int,
188 |         m_time timestamp
189 |       )
190 |       PARTITIONED BY (`name`)
191 |       LOCATION 's3://glue-iceberg-demo-atq4q5u/iceberg_demo_db/iceberg_demo_table'
192 |       TBLPROPERTIES (
193 |         'table_type'='iceberg'
194 |       );
195 |       
196 | If the query is successful, a table named `iceberg_demo_table` is created and displayed on the left panel under the **Tables** section. 197 | 198 | If you get an error, check if (a) you have updated the `LOCATION` to the correct S3 bucket name, (b) you have `iceberg_demo_db` selected under the **Database** dropdown, and (c) you have `AwsDataCatalog` selected as the **Data source**. 199 | 200 | :information_source: If you fail to create the table, give Athena users access permissions on `iceberg_demo_db` through [AWS Lake Formation](https://console.aws.amazon.com/lakeformation/home), or you can grant any principal using Athena access to `iceberg_demo_db` by running the following command: 201 |
202 |       (.venv) $ aws lakeformation grant-permissions \
203 |                 --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:user/example-user-id \
204 |                 --permissions CREATE_TABLE DESCRIBE ALTER DROP \
205 |                 --resource '{ "Database": { "Name": "iceberg_demo_db" } }'
206 |       (.venv) $ aws lakeformation grant-permissions \
207 |               --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:user/example-user-id \
208 |               --permissions SELECT DESCRIBE ALTER INSERT DELETE DROP \
209 |               --resource '{ "Table": {"DatabaseName": "iceberg_demo_db", "TableWildcard": {}} }'
210 |       
211 | 212 | 11. Make sure the glue job can access the Iceberg table in the Glue Catalog database 213 | 214 | We can check the granted permissions by running the following command: 215 |
216 |     (.venv) $ aws lakeformation list-permissions | jq -r '.PrincipalResourcePermissions[] | select(.Principal.DataLakePrincipalIdentifier | endswith(":role/GlueStreamingJobRole-MSK2Iceberg"))'
217 |     
218 | If nothing is found, we need to manually grant the glue job the required permissions by running the following command: 219 |
220 |     (.venv) $ aws lakeformation grant-permissions \
221 |                --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/GlueStreamingJobRole-MSK2Iceberg \
222 |                --permissions SELECT DESCRIBE ALTER INSERT DELETE \
223 |                --resource '{ "Table": {"DatabaseName": "iceberg_demo_db", "TableWildcard": {}} }'
224 |     
225 | 226 | 12. Run the glue job to load data from MSK into S3 227 |
228 |     (.venv) $ aws glue start-job-run --job-name streaming_data_from_kafka_into_iceberg_table
229 |     
230 | 231 | 13. Generate streaming data 232 | 233 | 1. Connect to the MSK client EC2 host. 234 | 235 | You can connect to an EC2 instance using the EC2 Instance Connect CLI.
Install the `ec2instanceconnectcli` Python package and use the **mssh** command with the instance ID as follows. 237 |
238 |        $ sudo pip install ec2instanceconnectcli
239 |        $ mssh ec2-user@i-001234a4bf70dec41EXAMPLE
240 |        
241 | 242 | 2. Create an Apache Kafka topic 243 | After connecting to your EC2 host, use the client machine to create a topic on the cluster. 244 | Run the following command to create a topic called `ev_stream_data`. 245 |
246 |        [ec2-user@ip-172-31-0-180 ~]$ export PATH=$HOME/opt/kafka/bin:$PATH
247 |        [ec2-user@ip-172-31-0-180 ~]$ export BS={BootstrapBrokerString}
248 |        [ec2-user@ip-172-31-0-180 ~]$ kafka-topics.sh --bootstrap-server $BS \
249 |           --create \
250 |           --topic ev_stream_data \
251 |           --partitions 3 \
252 |           --replication-factor 2
253 |        
254 | 255 | 3. Produce and consume data 256 | 257 | **(1) To produce messages** 258 | 259 | Run the following command to generate messages into the topic on the cluster. 260 | 261 |
262 |        [ec2-user@ip-172-31-0-180 ~]$ python3 gen_fake_data.py | kafka-console-producer.sh \
263 |           --bootstrap-server $BS \
264 |           --topic ev_stream_data \
265 |           --property parse.key=true \
266 |           --property key.separator='\t'
267 |        
268 | 269 | **(2) To consume messages** 270 | 271 | Keep the connection to the client machine open, and then open a second, separate connection to that machine in a new window. 272 | 273 |
274 |        [ec2-user@ip-172-31-0-180 ~]$ kafka-console-consumer.sh --bootstrap-server $BS \
275 |           --topic ev_stream_data \
276 |           --from-beginning
277 |        
278 | 279 | You should start seeing the messages you entered earlier when you used the console producer command. 280 | Enter more messages in the producer window, and watch them appear in the consumer window. 281 | 282 | We can synthetically generate data in JSON format using a simple Python application (a sketch follows the example below). 283 | 284 | Synthetic data example, ordered by `name` and `m_time`: 285 |
286 |     {"name": "Arica", "age": 48, "m_time": "2023-04-11 19:13:21"}
287 |     {"name": "Arica", "age": 32, "m_time": "2023-10-20 17:24:17"}
288 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
289 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
290 |     {"name": "Gonzalo", "age": 37, "m_time": "2023-01-11 06:18:26"}
291 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
292 |     {"name": "Micheal", "age": 45, "m_time": "2023-04-07 06:18:17"}
293 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
294 |     {"name": "Takisha", "age": 48, "m_time": "2023-12-20 16:44:13"}
295 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
296 |     
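    The repository ships `src/utils/gen_fake_data.py` for this (it uses `mimesis`, see `requirements-dev.txt`). The following is only a simplified stand-in built on the standard library; it prints one key, a tab, and a JSON record per line, which matches `parse.key=true` and the tab key separator in the producer command above:

    ```
    import json
    import random
    from datetime import datetime, timedelta

    NAMES = ['Arica', 'Fernando', 'Gonzalo', 'Micheal', 'Takisha']

    def gen_record():
        # Random record shaped like the example data above.
        return {
            'name': random.choice(NAMES),
            'age': random.randint(10, 70),
            'm_time': (datetime(2023, 1, 1)
                       + timedelta(seconds=random.randint(0, 364 * 24 * 3600))).strftime('%Y-%m-%d %H:%M:%S')
        }

    if __name__ == '__main__':
        for _ in range(10):
            record = gen_record()
            # parse.key=true with a tab separator expects "key<TAB>value" per line.
            print(f"{record['name']}\t{json.dumps(record)}")
    ```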
297 | 298 | Spark Writes using `DataFrame append` inserts all records into the Iceberg table. 299 |
300 |     {"name": "Arica", "age": 48, "m_time": "2023-04-11 19:13:21"}
301 |     {"name": "Arica", "age": 32, "m_time": "2023-10-20 17:24:17"}
302 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
303 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
304 |     {"name": "Gonzalo", "age": 37, "m_time": "2023-01-11 06:18:26"}
305 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
306 |     {"name": "Micheal", "age": 45, "m_time": "2023-04-07 06:18:17"}
307 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
308 |     {"name": "Takisha", "age": 48, "m_time": "2023-12-20 16:44:13"}
309 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
310 |     
311 | 312 | Spark Writes using `SQL insert overwrite` or `SQL merge into` insert the last updated records into the Iceberg table. 313 |
314 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
315 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
316 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
317 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
318 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
319 |     
320 | 321 | 14. Check streaming data in S3 322 | 323 | After `3~5` minutes, you can see that the streaming data have been delivered from **MSK** to **S3**. 324 | 325 | ![iceberg-table](./assets/iceberg-table.png) 326 | ![iceberg-table](./assets/iceberg-data-level-01.png) 327 | ![iceberg-table](./assets/iceberg-data-level-02.png) 328 | ![iceberg-table](./assets/iceberg-data-level-03.png) 329 | 330 | 15. Run test query 331 | 332 | Enter the following SQL statement and execute the query. 333 |
334 |     SELECT COUNT(*)
335 |     FROM iceberg_demo_db.iceberg_demo_table;
336 |     
337 | 338 | ## Clean Up 339 | 1. Stop the glue streaming job. 340 |
341 |    (.venv) $ JOB_RUN_IDS=$(aws glue get-job-runs --job-name streaming_data_from_kafka_into_iceberg_table | jq -r '.JobRuns[] | select(.JobRunState == "RUNNING") | .Id')
342 |    (.venv) $ aws glue batch-stop-job-run --job-name streaming_data_from_kafka_into_iceberg_table --job-run-ids $JOB_RUN_IDS
343 |    
344 | 2. Delete the CloudFormation stack by running the below command. 345 |
346 |    (.venv) $ cdk destroy --all
347 |    
348 | 349 | ## Useful commands 350 | 351 | * `cdk ls` list all stacks in the app 352 | * `cdk synth` emits the synthesized CloudFormation template 353 | * `cdk deploy` deploy this stack to your default AWS account/region 354 | * `cdk diff` compare deployed stack with current state 355 | * `cdk docs` open CDK documentation 356 | 357 | ## References 358 | 359 | * (1) [AWS Glue versions](https://docs.aws.amazon.com/glue/latest/dg/release-notes.html): The AWS Glue version determines the versions of Apache Spark and Python that AWS Glue supports. 360 | * (2) [Use the AWS Glue connector to read and write Apache Iceberg tables with ACID transactions and perform time travel \(2022-06-21\)](https://aws.amazon.com/ko/blogs/big-data/use-the-aws-glue-connector-to-read-and-write-apache-iceberg-tables-with-acid-transactions-and-perform-time-travel/) 361 | * (3) [Spark Stream Processing with Amazon EMR using Apache Kafka streams running in Amazon MSK (2022-06-30)](https://yogender027mae.medium.com/spark-stream-processing-with-amazon-emr-using-apache-kafka-streams-running-in-amazon-msk-9776036c18d9) 362 | * [yogenderPalChandra/AmazonMSK-EMR-tem-data](https://github.com/yogenderPalChandra/AmazonMSK-EMR-tem-data) - This is repo for medium article "Spark Stream Processing with Amazon EMR using Kafka streams running in Amazon MSK" 363 | * (4) [Amazon Athena Using Iceberg tables](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html) 364 | * (5) [Streaming ETL jobs in AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/add-job-streaming.html) 365 | * (6) [AWS Glue job parameters](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) 366 | * (7) [Connection types and options for ETL in AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-connect.html#aws-glue-programming-etl-connect-kafka) 367 | * (8) [Creating an AWS Glue connection for an Apache Kafka data stream](https://docs.aws.amazon.com/glue/latest/dg/add-job-streaming.html#create-conn-streaming) 368 | * (9) [Crafting serverless streaming ETL jobs with AWS Glue (2020-10-14)](https://aws.amazon.com/ko/blogs/big-data/crafting-serverless-streaming-etl-jobs-with-aws-glue/) 369 | * (10) [Apache Iceberg - Spark Writes with SQL (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-writes/) 370 | * (11) [Apache Iceberg - Spark Structured Streaming (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/) 371 | * (12) [Apache Iceberg - Writing against partitioned table (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#writing-against-partitioned-table) 372 | * Iceberg supports append and complete output modes: 373 | * `append`: appends the rows of every micro-batch to the table 374 | * `complete`: replaces the table contents every micro-batch 375 | 376 | Iceberg requires the data to be sorted according to the partition spec per task (Spark partition) in prior to write against partitioned table.
377 | Otherwise, you might encounter the following error: 378 |
379 |        pyspark.sql.utils.AnalysisException: Complete output mode not supported when there are no streaming aggregations on streaming DataFrame/Datasets;
380 |        
381 | * (13) [Apache Iceberg - Maintenance for streaming tables (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#maintenance-for-streaming-tables) 382 | * (14) [awsglue python package](https://github.com/awslabs/aws-glue-libs): The awsglue Python package contains the Python portion of the AWS Glue library. This library extends PySpark to support serverless ETL on AWS. 383 | 384 | ## Troubleshooting 385 | 386 | * Granting database or table permissions error using AWS CDK 387 | * Error message: 388 |
389 |      AWS::LakeFormation::PrincipalPermissions | CfnPrincipalPermissions Resource handler returned message: "Resource does not exist or requester is not authorized to access requested permissions. (Service: LakeFormation, Status Code: 400, Request ID: f4d5e58b-29b6-4889-9666-7e38420c9035)" (RequestToken: 4a4bb1d6-b051-032f-dd12-5951d7b4d2a9, HandlerErrorCode: AccessDenied)
390 |      
391 | * Solution: 392 | 393 | The role assumed by cdk is not a data lake administrator. (e.g., `cdk-hnb659fds-deploy-role-12345678912-us-east-1`)
So, deploying `PrincipalPermissions` fails with an error such as: 395 | 396 | `Resource does not exist or requester is not authorized to access requested permissions.` 397 | 398 | In order to solve the error, it is necessary to promote the cdk execution role to a data lake administrator, as sketched below.
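    A minimal CDK sketch of that promotion (the role ARN below assumes the default CDK bootstrap qualifier `hnb659fds`; adjust it to your environment, and note this is an illustration rather than code from this repository):

    ```
    import aws_cdk as cdk
    from aws_cdk import aws_lakeformation

    class DataLakeAdminStack(cdk.Stack):
        def __init__(self, scope, construct_id, **kwargs):
            super().__init__(scope, construct_id, **kwargs)

            # Default CDK bootstrap deploy role; the qualifier must match your bootstrap.
            cdk_deploy_role_arn = (f"arn:aws:iam::{self.account}:role/"
                                   f"cdk-hnb659fds-deploy-role-{self.account}-{self.region}")

            # Register the CDK deploy role as a Lake Formation data lake administrator.
            aws_lakeformation.CfnDataLakeSettings(self, "CDKDataLakeSettings",
                admins=[
                    aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                        data_lake_principal_identifier=cdk_deploy_role_arn
                    )
                ]
            )
    ```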
For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 400 | 401 | * Reference: 402 | 403 | [https://github.com/aws-samples/data-lake-as-code](https://github.com/aws-samples/data-lake-as-code) - Data Lake as Code 404 | 405 | -------------------------------------------------------------------------------- /msk-serverless-to-iceberg/README.md: -------------------------------------------------------------------------------- 1 | 2 | # AWS Glue Streaming ETL Job with Amazon MSK Serverless and Apache Iceberg 3 | 4 | ![glue-streaming-data-from-msk-serverless-to-iceberg-table](./glue-streaming-data-from-msk-serverless-to-iceberg-table.svg) 5 | 6 | In this project, we create a streaming ETL job in AWS Glue to integrate Iceberg with a streaming use case and create an in-place updatable data lake on Amazon S3. 7 | 8 | After streaming data is ingested from Amazon MSK Serverless to Amazon S3, you can query the data with [Amazon Athena](http://aws.amazon.com/athena). 9 | 10 | This project can be deployed with [AWS CDK Python](https://docs.aws.amazon.com/cdk/api/v2/). 11 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 12 | 13 | This project is set up like a standard Python project. The initialization 14 | process also creates a virtualenv within this project, stored under the `.venv` 15 | directory. To create the virtualenv it assumes that there is a `python3` 16 | (or `python` for Windows) executable in your path with access to the `venv` 17 | package. If for any reason the automatic creation of the virtualenv fails, 18 | you can create the virtualenv manually. 19 | 20 | To manually create a virtualenv on MacOS and Linux: 21 | 22 | ``` 23 | $ python3 -m venv .venv 24 | ``` 25 | 26 | After the init process completes and the virtualenv is created, you can use the following 27 | step to activate your virtualenv. 28 | 29 | ``` 30 | $ source .venv/bin/activate 31 | ``` 32 | 33 | If you are on a Windows platform, you would activate the virtualenv like this: 34 | 35 | ``` 36 | % .venv\Scripts\activate.bat 37 | ``` 38 | 39 | Once the virtualenv is activated, you can install the required dependencies. 40 | 41 | ``` 42 | (.venv) $ pip install -r requirements.txt 43 | ``` 44 | 45 | In case of `AWS Glue 3.0`, before synthesizing the CloudFormation template, **you first need to set up the Apache Iceberg connector for AWS Glue in order to use Apache Iceberg with AWS Glue jobs.** (For more information, see [References](#references) (2)) 46 | 47 | Then you should set up the cdk context configuration file, `cdk.context.json`, appropriately. 48 | 49 | For example: 50 |
 51 | {
 52 |   "vpc_name": "default",
 53 |   "msk_cluster_name": "iceberg-demo-stream",
 54 |   "glue_assets_s3_bucket_name": "aws-glue-assets-123456789012-atq4q5u",
 55 |   "glue_job_script_file_name": "spark_sql_merge_into_iceberg_from_msk_serverless.py",
 56 |   "glue_job_name": "streaming_data_from_msk_serverless_into_iceberg_table",
 57 |   "glue_job_input_arguments": {
 58 |     "--catalog": "job_catalog",
 59 |     "--database_name": "iceberg_demo_db",
 60 |     "--table_name": "iceberg_demo_table",
 61 |     "--primary_key": "name",
 62 |     "--kafka_topic_name": "ev_stream_data",
 63 |     "--starting_offsets_of_kafka_topic": "latest",
 64 |     "--iceberg_s3_path": "s3://glue-iceberg-demo-atq4q5u/iceberg_demo_db",
 65 |     "--lock_table_name": "iceberg_lock",
 66 |     "--aws_region": "us-east-1",
 67 |     "--window_size": "100 seconds",
 68 |     "--extra-jars": "s3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar",
 69 |     "--user-jars-first": "true"
 70 |   },
 71 |   "glue_connections_name": "iceberg-connection"
 72 | }
 73 | 
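MSK Serverless only supports IAM authentication, so the Glue streaming scripts have to pass the MSK IAM options to the Kafka source. Conceptually the reader configuration looks like the sketch below (the bootstrap server is a placeholder, and the actual scripts in `src/main/python/` resolve the broker address and topic from the job arguments and the Glue connection):

```
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# IAM authentication for MSK Serverless; requires the aws-msk-iam-auth jar on the classpath.
kafka_options = {
    "kafka.bootstrap.servers": "boot-xxxxxxxx.c1.kafka-serverless.us-east-1.amazonaws.com:9098",  # placeholder
    "subscribe": "ev_stream_data",
    "startingOffsets": "latest",
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "AWS_MSK_IAM",
    "kafka.sasl.jaas.config": "software.amazon.msk.auth.iam.IAMLoginModule required;",
    "kafka.sasl.client.callback.handler.class": "software.amazon.msk.auth.iam.IAMClientCallbackHandler",
}

streaming_df = spark.readStream.format("kafka").options(**kafka_options).load()
```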
74 | 75 | :information_source: The `--primary_key` option should be set to the Iceberg table's primary key column name. 76 | 77 | :warning: **You should create an S3 bucket for the glue job script and upload the glue job script file into that S3 bucket.** 78 | 79 | At this point you can now synthesize the CloudFormation template for this code. 80 | 81 |
 82 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
 83 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
 84 | (.venv) $ cdk synth --all
 85 | 
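These environment variables tell the CDK app which account and region to synthesize for. The usual pattern in `app.py` (shown here as a generic sketch, not a verbatim copy of this project's `app.py`) is:

```
import os

import aws_cdk as cdk

app = cdk.App()

# Resolve the target account/region from the variables exported above.
APP_ENV = cdk.Environment(
    account=os.environ['CDK_DEFAULT_ACCOUNT'],
    region=os.environ['CDK_DEFAULT_REGION']
)

# Each stack in this project is then instantiated with env=APP_ENV before app.synth().
app.synth()
```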
86 | 87 | To add additional dependencies, for example other CDK libraries, just add 88 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 89 | command. 90 | 91 | ## Run Test 92 | 93 | 1. Set up the **Apache Iceberg connector for AWS Glue** to use Apache Iceberg with AWS Glue jobs. 94 | 2. Create an S3 bucket for the Apache Iceberg table 95 |
 96 |    (.venv) $ cdk deploy MSKServerlessToIcebergS3Path
 97 |    
98 | 3. Create an MSK Serverless cluster 99 |
100 |    (.venv) $ cdk deploy MSKServerlessToIcebergStackVpc MSKServerlessAsGlueStreamingJobDataSource
101 |    
102 | 4. Create an MSK connection for the Glue Streaming Job 103 |
104 |    (.venv) $ cdk deploy GlueMSKServerlessConnection
105 |    
106 | For more information, see [References](#references) (8) 107 | 5. Create an IAM Role for the Glue Streaming Job 108 |
109 |    (.venv) $ cdk deploy GlueStreamingMSKServerlessToIcebergJobRole
110 |    
111 | 6. Set up a Kafka Client Machine 112 |
113 |    (.venv) $ cdk deploy MSKServerlessClientEC2Instance
114 |    
115 | 7. Create a Glue Database for an Apache Iceberg table 116 |
117 |    (.venv) $ cdk deploy GlueIcebergDatabase
118 |    
119 | 8. Upload the **AWS SDK for Java 2.x** jar file into S3 120 |
121 |    (.venv) $ wget https://repo1.maven.org/maven2/software/amazon/awssdk/aws-sdk-java/2.17.224/aws-sdk-java-2.17.224.jar
122 |    (.venv) $ aws s3 cp aws-sdk-java-2.17.224.jar s3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar
123 |    
124 | A Glue Streaming Job might fail because of the following error: 125 |
126 |    py4j.protocol.Py4JJavaError: An error occurred while calling o135.start.
127 |    : java.lang.NoSuchMethodError: software.amazon.awssdk.utils.SystemSetting.getStringValueFromEnvironmentVariable(Ljava/lang/String;)Ljava/util/Optional
128 |    
129 | We can work around the problem by starting the Glue Job with the additional parameters: 130 |
131 |    --extra-jars s3://path/to/aws-sdk-for-java-v2.jar
132 |    --user-jars-first true
133 |    
134 | In order to do this, we might need to upload the **AWS SDK for Java 2.x** jar file into S3. 135 | 9. Create a Glue Streaming Job 136 | 137 | * (step 1) Select one of the Glue Job scripts and upload it into S3 138 | 139 | **List of Glue Job Scripts** 140 | | File name | Spark Writes | 141 | |-----------|--------------| 142 | | spark_dataframe_insert_iceberg_from_msk_serverless.py | DataFrame append | 143 | | spark_sql_insert_overwrite_iceberg_from_msk_serverless.py | SQL insert overwrite | 144 | | spark_sql_merge_into_iceberg_from_msk_serverless.py | SQL merge into | 145 | 146 |
147 |      (.venv) $ ls src/main/python/
148 |       spark_dataframe_insert_iceberg_from_msk_serverless.py
149 |       spark_sql_insert_overwrite_iceberg_from_msk_serverless.py
150 |       spark_sql_merge_into_iceberg_from_msk_serverless.py
151 |      (.venv) $ aws s3 mb s3://aws-glue-assets-123456789012-atq4q5u --region us-east-1
152 |      (.venv) $ aws s3 cp src/main/python/spark_sql_merge_into_iceberg_from_msk_serverless.py s3://aws-glue-assets-123456789012-atq4q5u/scripts/
153 |      
154 | 155 | * (step 2) Provision the Glue Streaming Job 156 | 157 |
158 |      (.venv) $ cdk deploy GrantLFPermissionsOnGlueJobRole \
159 |                           GlueStreamingJobMSKServerlessToIceberg
160 |      
161 | 162 | 10. Create a table with partitioned data in Amazon Athena 163 | 164 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console. 165 | 166 | * (step 1) Create a database 167 | 168 | In order to create a new database called `iceberg_demo_db`, enter the following statement in the Athena query editor 169 | and click the **Run** button to execute the query. 170 | 171 |
172 |      CREATE DATABASE IF NOT EXISTS iceberg_demo_db
173 |      
174 | 175 | * (step 2) Create a table 176 | 177 | Copy the following query into the Athena query editor, replace the example bucket name in the `LOCATION` clause with your own S3 bucket name, and execute the query to create a new table. 178 | 179 |
180 |       CREATE TABLE iceberg_demo_db.iceberg_demo_table (
181 |         name string,
182 |         age int,
183 |         m_time timestamp
184 |       )
185 |       PARTITIONED BY (`name`)
186 |       LOCATION 's3://glue-iceberg-demo-atq4q5u/iceberg_demo_db/iceberg_demo_table'
187 |       TBLPROPERTIES (
188 |         'table_type'='iceberg'
189 |       );
190 |       
191 | If the query is successful, a table named `iceberg_demo_table` is created and displayed on the left panel under the **Tables** section. 192 | 193 | If you get an error, check if (a) you have updated the `LOCATION` to the correct S3 bucket name, (b) you have `iceberg_demo_db` selected under the **Database** dropdown, and (c) you have `AwsDataCatalog` selected as the **Data source**. 194 | 195 | :information_source: If you fail to create the table, give Athena users access permissions on `iceberg_demo_db` through [AWS Lake Formation](https://console.aws.amazon.com/lakeformation/home), or you can grant any principal using Athena access to `iceberg_demo_db` by running the following command: 196 |
197 |       (.venv) $ aws lakeformation grant-permissions \
198 |                 --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:user/example-user-id \
199 |                 --permissions CREATE_TABLE DESCRIBE ALTER DROP \
200 |                 --resource '{ "Database": { "Name": "iceberg_demo_db" } }'
201 |       (.venv) $ aws lakeformation grant-permissions \
202 |               --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:user/example-user-id \
203 |               --permissions SELECT DESCRIBE ALTER INSERT DELETE DROP \
204 |               --resource '{ "Table": {"DatabaseName": "iceberg_demo_db", "TableWildcard": {}} }'
205 |       
206 | 207 | 11. Make sure the glue job can access the Iceberg table in the Glue Catalog database 208 | 209 | We can check the granted permissions by running the following command: 210 |
211 |     (.venv) $ aws lakeformation list-permissions | jq -r '.PrincipalResourcePermissions[] | select(.Principal.DataLakePrincipalIdentifier | endswith(":role/GlueJobRole-MSKServerless2Iceberg"))'
212 |     
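    The same check can be scripted with `boto3`, mirroring the `jq` filter above (the role name must match the role created by the `GlueStreamingMSKServerlessToIcebergJobRole` stack):

    ```
    import boto3

    lf = boto3.client('lakeformation', region_name='us-east-1')

    role_suffix = ':role/GlueJobRole-MSKServerless2Iceberg'
    found, token = [], None

    # Page through all Lake Formation permissions and keep those granted to the Glue job role.
    while True:
        kwargs = {'NextToken': token} if token else {}
        response = lf.list_permissions(**kwargs)
        for permission in response['PrincipalResourcePermissions']:
            if permission['Principal']['DataLakePrincipalIdentifier'].endswith(role_suffix):
                found.append(permission)
        token = response.get('NextToken')
        if not token:
            break

    print(f"{len(found)} permission(s) granted to the Glue job role")
    ```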
213 | If nothing is found, we need to manually grant the glue job the required permissions by running the following command: 214 |
215 |     (.venv) $ aws lakeformation grant-permissions \
216 |                 --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/GlueJobRole-MSKServerless2Iceberg \
217 |                 --permissions SELECT DESCRIBE ALTER INSERT DELETE \
218 |                 --resource '{ "Table": {"DatabaseName": "iceberg_demo_db", "TableWildcard": {}} }'
219 |     
220 | 221 | 12. Run the glue job to load data from MSK Serverless into S3 222 |
223 |     (.venv) $ aws glue start-job-run --job-name streaming_data_from_msk_serverless_into_iceberg_table
224 |     
225 | 226 | 13. Generate streaming data 227 | 228 | 1. Connect to the MSK client EC2 host. 229 | 230 | You can connect to an EC2 instance using the EC2 Instance Connect CLI.
Install the `ec2instanceconnectcli` Python package and use the **mssh** command with the instance ID as follows. 232 |
233 |        $ sudo pip install ec2instanceconnectcli
234 |        $ mssh ec2-user@i-001234a4bf70dec41EXAMPLE
235 |        
236 | 237 | 2. Create an Apache Kafka topic 238 | After connecting to your EC2 host, use the client machine to create a topic on the cluster. 239 | Run the following command to create a topic called `ev_stream_data`. 240 |
241 |        [ec2-user@ip-172-31-0-180 ~]$ export PATH=$HOME/opt/kafka/bin:$PATH
242 |        [ec2-user@ip-172-31-0-180 ~]$ export BS={BootstrapBrokerString}
243 |        [ec2-user@ip-172-31-0-180 ~]$ kafka-topics.sh --bootstrap-server $BS \
244 |           --command-config client.properties \
245 |           --create \
246 |           --topic ev_stream_data \
247 |           --partitions 3 \
248 |           --replication-factor 2
249 |        
250 | 251 | 3. Produce and consume data 252 | 253 | **(1) To produce messages** 254 | 255 | Run the following command to generate messages into the topic on the cluster. 256 | 257 |
258 |        [ec2-user@ip-172-31-0-180 ~]$ python3 gen_fake_data.py | kafka-console-producer.sh \
259 |           --bootstrap-server $BS \
260 |           --producer.config client.properties \
261 |           --topic ev_stream_data \
262 |           --property parse.key=true \
263 |           --property key.separator='\t'
264 |        
265 | 266 | **(2) To consume messages** 267 | 268 | Keep the connection to the client machine open, and then open a second, separate connection to that machine in a new window. 269 | 270 |
271 |        [ec2-user@ip-172-31-0-180 ~]$ kafka-console-consumer.sh --bootstrap-server $BS \
272 |           --consumer.config client.properties \
273 |           --topic ev_stream_data \
274 |           --from-beginning
275 |        
276 | 277 | You should start seeing the messages you entered earlier when you used the console producer command. 278 | Enter more messages in the producer window, and watch them appear in the consumer window. 279 | 280 | We can synthetically generate data in JSON format using a simple Python application. 281 | 282 | Synthetic data example, ordered by `name` and `m_time`: 283 |
284 |     {"name": "Arica", "age": 48, "m_time": "2023-04-11 19:13:21"}
285 |     {"name": "Arica", "age": 32, "m_time": "2023-10-20 17:24:17"}
286 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
287 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
288 |     {"name": "Gonzalo", "age": 37, "m_time": "2023-01-11 06:18:26"}
289 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
290 |     {"name": "Micheal", "age": 45, "m_time": "2023-04-07 06:18:17"}
291 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
292 |     {"name": "Takisha", "age": 48, "m_time": "2023-12-20 16:44:13"}
293 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
294 |     
295 | 296 | Spark Writes using `DataFrame append` inserts all records into the Iceberg table. 297 |
298 |     {"name": "Arica", "age": 48, "m_time": "2023-04-11 19:13:21"}
299 |     {"name": "Arica", "age": 32, "m_time": "2023-10-20 17:24:17"}
300 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
301 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
302 |     {"name": "Gonzalo", "age": 37, "m_time": "2023-01-11 06:18:26"}
303 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
304 |     {"name": "Micheal", "age": 45, "m_time": "2023-04-07 06:18:17"}
305 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
306 |     {"name": "Takisha", "age": 48, "m_time": "2023-12-20 16:44:13"}
307 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
308 |     
309 | 310 | Spark Writes using `SQL insert overwrite` or `SQL merge into` inserts only the last updated record for each primary key into the Iceberg table (a sketch of the merge pattern follows the example below). 311 |
312 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
313 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
314 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
315 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
316 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
317 |     
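    A condensed sketch of that merge pattern (the shipped script `spark_sql_merge_into_iceberg_from_msk_serverless.py` is the authoritative version; catalog, database, table, key, and trigger values are the example values from `cdk.context.json`):

    ```
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, row_number
    from pyspark.sql.window import Window

    spark = SparkSession.builder.getOrCreate()

    def upsert_to_iceberg(batch_df, batch_id):
        # Keep only the newest record per primary key within the micro-batch.
        w = Window.partitionBy('name').orderBy(col('m_time').desc())
        latest_df = (batch_df.withColumn('_rn', row_number().over(w))
                     .where(col('_rn') == 1).drop('_rn'))
        latest_df.createOrReplaceTempView('kafka_updates')

        spark.sql("""
            MERGE INTO job_catalog.iceberg_demo_db.iceberg_demo_table t
            USING kafka_updates s
            ON t.name = s.name
            WHEN MATCHED THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *
        """)

    # parsed_df is assumed to be the streaming DataFrame parsed from the Kafka value column.
    # query = (parsed_df.writeStream.foreachBatch(upsert_to_iceberg)
    #          .trigger(processingTime='100 seconds').start())
    ```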
318 | 319 | 14. Check streaming data in S3 320 | 321 | After `3~5` minutes, you can see that the streaming data have been delivered from **MSK Serverless** to **S3**. 322 | 323 | ![iceberg-table](./assets/iceberg-table.png) 324 | ![iceberg-table](./assets/iceberg-data-level-01.png) 325 | ![iceberg-table](./assets/iceberg-data-level-02.png) 326 | ![iceberg-table](./assets/iceberg-data-level-03.png) 327 | 328 | 15. Run test query 329 | 330 | Enter the following SQL statement and execute the query. 331 |
332 |     SELECT COUNT(*)
333 |     FROM iceberg_demo_db.iceberg_demo_table;
334 |     
335 | 336 | ## Clean Up 337 | 1. Stop the glue streaming job. 338 |
339 |    (.venv) $ JOB_RUN_IDS=$(aws glue get-job-runs --job-name streaming_data_from_msk_serverless_into_iceberg_table | jq -r '.JobRuns[] | select(.JobRunState == "RUNNING") | .Id')
340 |    (.venv) $ aws glue batch-stop-job-run --job-name streaming_data_from_msk_serverless_into_iceberg_table --job-run-ids $JOB_RUN_IDS
341 |    
342 | 2. Delete the CloudFormation stack by running the below command. 343 |
344 |    (.venv) $ cdk destroy --all
345 |    
346 | 347 | ## Useful commands 348 | 349 | * `cdk ls` list all stacks in the app 350 | * `cdk synth` emits the synthesized CloudFormation template 351 | * `cdk deploy` deploy this stack to your default AWS account/region 352 | * `cdk diff` compare deployed stack with current state 353 | * `cdk docs` open CDK documentation 354 | 355 | ## References 356 | 357 | * (1) [AWS Glue versions](https://docs.aws.amazon.com/glue/latest/dg/release-notes.html): The AWS Glue version determines the versions of Apache Spark and Python that AWS Glue supports. 358 | * (2) [Use the AWS Glue connector to read and write Apache Iceberg tables with ACID transactions and perform time travel \(2022-06-21\)](https://aws.amazon.com/ko/blogs/big-data/use-the-aws-glue-connector-to-read-and-write-apache-iceberg-tables-with-acid-transactions-and-perform-time-travel/) 359 | * (3) [Spark Stream Processing with Amazon EMR using Apache Kafka streams running in Amazon MSK (2022-06-30)](https://yogender027mae.medium.com/spark-stream-processing-with-amazon-emr-using-apache-kafka-streams-running-in-amazon-msk-9776036c18d9) 360 | * [yogenderPalChandra/AmazonMSK-EMR-tem-data](https://github.com/yogenderPalChandra/AmazonMSK-EMR-tem-data) - This is repo for medium article "Spark Stream Processing with Amazon EMR using Kafka streams running in Amazon MSK" 361 | * (4) [Amazon Athena Using Iceberg tables](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html) 362 | * (5) [Streaming ETL jobs in AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/add-job-streaming.html) 363 | * (6) [AWS Glue job parameters](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) 364 | * (7) [Connection types and options for ETL in AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-connect.html#aws-glue-programming-etl-connect-kafka) 365 | * (8) [Creating an AWS Glue connection for an Apache Kafka data stream](https://docs.aws.amazon.com/glue/latest/dg/add-job-streaming.html#create-conn-streaming) 366 | * (9) [Crafting serverless streaming ETL jobs with AWS Glue (2020-10-14)](https://aws.amazon.com/ko/blogs/big-data/crafting-serverless-streaming-etl-jobs-with-aws-glue/) 367 | * (10) [Actions, resources, and condition keys for Apache Kafka APIs for Amazon MSK clusters](https://docs.aws.amazon.com/service-authorization/latest/reference/list_apachekafkaapisforamazonmskclusters.html) 368 | * (11) [Apache Iceberg - Spark Writes with SQL (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-writes/) 369 | * (12) [Apache Iceberg - Spark Structured Streaming (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/) 370 | * (13) [Apache Iceberg - Writing against partitioned table (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#writing-against-partitioned-table) 371 | * Iceberg supports append and complete output modes: 372 | * `append`: appends the rows of every micro-batch to the table 373 | * `complete`: replaces the table contents every micro-batch 374 | 375 | Iceberg requires the data to be sorted according to the partition spec per task (Spark partition) in prior to write against partitioned table.
376 | Otherwise, you might encounter the following error: 377 |
378 |        pyspark.sql.utils.AnalysisException: Complete output mode not supported when there are no streaming aggregations on streaming DataFrame/Datasets;
379 |        
380 | * (14) [Apache Iceberg - Maintenance for streaming tables (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#maintenance-for-streaming-tables) 381 | * (15) [awsglue python package](https://github.com/awslabs/aws-glue-libs): The awsglue Python package contains the Python portion of the AWS Glue library. This library extends PySpark to support serverless ETL on AWS. 382 | * (16) [AWS Glue Notebook Samples](https://github.com/aws-samples/aws-glue-samples/tree/master/examples/notebooks) - sample iPython notebook files which show you how to use open data lake formats; Apache Hudi, Delta Lake, and Apache Iceberg on AWS Glue Interactive Sessions and AWS Glue Studio Notebook. 383 | 384 | ## Troubleshooting 385 | 386 | * Granting database or table permissions error using AWS CDK 387 | * Error message: 388 |
389 |      AWS::LakeFormation::PrincipalPermissions | CfnPrincipalPermissions Resource handler returned message: "Resource does not exist or requester is not authorized to access requested permissions. (Service: LakeFormation, Status Code: 400, Request ID: f4d5e58b-29b6-4889-9666-7e38420c9035)" (RequestToken: 4a4bb1d6-b051-032f-dd12-5951d7b4d2a9, HandlerErrorCode: AccessDenied)
390 |      
391 | * Solution: 392 | 393 | The role assumed by cdk is not a data lake administrator. (e.g., `cdk-hnb659fds-deploy-role-12345678912-us-east-1`)
So, deploying `PrincipalPermissions` fails with an error such as: 395 | 396 | `Resource does not exist or requester is not authorized to access requested permissions.` 397 | 398 | In order to solve the error, it is necessary to promote the cdk execution role to a data lake administrator.
399 | For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 400 | 401 | * Reference: 402 | 403 | [https://github.com/aws-samples/data-lake-as-code](https://github.com/aws-samples/data-lake-as-code) - Data Lake as Code 404 | 405 | -------------------------------------------------------------------------------- /msk-to-iceberg/glue-streaming-data-from-kafka-to-iceberg-table.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
[architecture diagram text: AWS Cloud, Private subnet, Python Data Generator, MSK, Glue Streaming, Glue Data Catalog, S3, Athena]
-------------------------------------------------------------------------------- /msk-serverless-to-iceberg/glue-streaming-data-from-msk-serverless-to-iceberg-table.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
[architecture diagram text: AWS Cloud, Private subnet, Python Data Generator, MSK Serverless, Glue Streaming, Glue Data Catalog, S3, Athena]
--------------------------------------------------------------------------------