├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── app.py ├── assets ├── iceberg-data-level-01.png ├── iceberg-data-level-02.png ├── iceberg-data-level-03.png └── iceberg-table.png ├── cdk.json ├── cdk_stacks ├── __init__.py ├── glue_job_role.py ├── glue_stream_data_schema.py ├── glue_streaming_job.py ├── kds.py ├── lakeformation_permissions.py └── s3.py ├── glue-streaming-data-to-iceberg-table.svg ├── requirements-dev.txt ├── requirements.txt ├── source.bat └── src ├── main └── python │ ├── spark_iceberg_writes_with_dataframe.py │ ├── spark_iceberg_writes_with_sql_insert_overwrite.py │ └── spark_iceberg_writes_with_sql_merge_into.py └── utils └── gen_fake_kinesis_stream_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | Untitled*.ipynb 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | .DS_Store 108 | .idea/ 109 | bin/ 110 | lib64 111 | pyvenv.cfg 112 | *.bak 113 | share/ 114 | cdk.out/ 115 | cdk.context.json* 116 | zap/ 117 | 118 | */.gitignore 119 | */setup.py 120 | */source.bat 121 | 122 | */*/.gitignore 123 | */*/setup.py 124 | */*/source.bat 125 | 126 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. 
We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # AWS Glue Streaming ETL Job with Apace Iceberg CDK Python project! 3 | 4 | ![glue-streaming-data-to-iceberg-table](./glue-streaming-data-to-iceberg-table.svg) 5 | 6 | In this project, we create a streaming ETL job in AWS Glue to integrate Iceberg with a streaming use case and create an in-place updatable data lake on Amazon S3. 7 | 8 | After ingested to Amazon S3, you can query the data with [Amazon Athena](http://aws.amazon.com/athena). 9 | 10 | This project can be deployed with [AWS CDK Python](https://docs.aws.amazon.com/cdk/api/v2/). 11 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 12 | 13 | This project is set up like a standard Python project. The initialization 14 | process also creates a virtualenv within this project, stored under the `.venv` 15 | directory. To create the virtualenv it assumes that there is a `python3` 16 | (or `python` for Windows) executable in your path with access to the `venv` 17 | package. If for any reason the automatic creation of the virtualenv fails, 18 | you can create the virtualenv manually. 19 | 20 | To manually create a virtualenv on MacOS and Linux: 21 | 22 | ``` 23 | $ python3 -m venv .venv 24 | ``` 25 | 26 | After the init process completes and the virtualenv is created, you can use the following 27 | step to activate your virtualenv. 28 | 29 | ``` 30 | $ source .venv/bin/activate 31 | ``` 32 | 33 | If you are a Windows platform, you would activate the virtualenv like this: 34 | 35 | ``` 36 | % .venv\Scripts\activate.bat 37 | ``` 38 | 39 | Once the virtualenv is activated, you can install the required dependencies. 40 | 41 | ``` 42 | (.venv) $ pip install -r requirements.txt 43 | ``` 44 | 45 | In case of `AWS Glue 3.0`, before synthesizing the CloudFormation, **you first set up Apache Iceberg connector for AWS Glue to use Apache Iceber with AWS Glue jobs.** (For more information, see [References](#references) (2)) 46 | 47 | Then you should set approperly the cdk context configuration file, `cdk.context.json`. 48 | 49 | For example: 50 |
 51 | {
 52 |   "kinesis_stream_name": "iceberg-demo-stream",
 53 |   "glue_assets_s3_bucket_name": "aws-glue-assets-123456789012-atq4q5u",
 54 |   "glue_job_script_file_name": "spark_iceberg_writes_with_dataframe.py",
 55 |   "glue_job_name": "streaming_data_from_kds_into_iceberg_table",
 56 |   "glue_job_input_arguments": {
 57 |     "--catalog": "job_catalog",
 58 |     "--database_name": "iceberg_demo_db",
 59 |     "--table_name": "iceberg_demo_table",
 60 |     "--primary_key": "name",
 61 |     "--kinesis_table_name": "iceberg_demo_kinesis_stream_table",
 62 |     "--starting_position_of_kinesis_iterator": "LATEST",
 63 |     "--iceberg_s3_path": "s3://glue-iceberg-demo-atq4q5u/iceberg_demo_db",
 64 |     "--lock_table_name": "iceberg_lock",
 65 |     "--aws_region": "us-east-1",
 66 |     "--window_size": "100 seconds",
 67 |     "--extra-jars": "s3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar",
 68 |     "--user-jars-first": "true"
 69 |   },
 70 |   "glue_connections_name": "iceberg-connection",
 71 |   "glue_kinesis_table": {
 72 |     "database_name": "iceberg_demo_db",
 73 |     "table_name": "iceberg_demo_kinesis_stream_table",
 74 |     "columns": [
 75 |       {
 76 |         "name": "name",
 77 |         "type": "string"
 78 |       },
 79 |       {
 80 |         "name": "age",
 81 |         "type": "int"
 82 |       },
 83 |       {
 84 |         "name": "m_time",
 85 |         "type": "string"
 86 |       }
 87 |     ]
 88 |   }
 89 | }
 90 | 
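
Each stack reads these values through the CDK context API (`self.node.try_get_context(...)`). For instance, the Iceberg warehouse bucket name is derived from `--iceberg_s3_path`, as `cdk_stacks/s3.py` later in this repository does. A minimal, self-contained sketch of that derivation (file name and keys as in the example above):

    import json
    from urllib.parse import urlparse

    # Mirrors how cdk_stacks/s3.py derives the bucket name from the context value.
    with open("cdk.context.json") as f:
        ctx = json.load(f)

    iceberg_s3_path = ctx["glue_job_input_arguments"]["--iceberg_s3_path"]
    print(urlparse(iceberg_s3_path).netloc)  # e.g. glue-iceberg-demo-atq4q5u
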
91 | 92 | :information_source: The `--primary_key` option should be set to the Iceberg table's primary key column name. 93 | 94 | :warning: **You should create an S3 bucket for the Glue job script and upload the script file into that bucket.** 95 | 96 | At this point you can now synthesize the CloudFormation template for this code. 97 | 98 |
 99 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
100 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
101 | (.venv) $ cdk synth --all
102 | 
103 | 104 | To add additional dependencies, for example other CDK libraries, just add 105 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 106 | command. 107 | 108 | ## Run Test 109 | 110 | 1. Set up the **Apache Iceberg connector for AWS Glue** to use Apache Iceberg with AWS Glue jobs. 111 | 2. Create an S3 bucket for the Apache Iceberg table 112 |
113 |    (.venv) $ cdk deploy IcebergS3Path
114 |    
115 | 3. Create a Kinesis data stream 116 |
117 |    (.venv) $ cdk deploy KinesisStreamAsGlueStreamingJobDataSource
118 |    
119 | 4. Define a schema for the streaming data 120 |
121 |    (.venv) $ cdk deploy GlueSchemaOnKinesisStream
122 |    
123 | 124 | Running the `cdk deploy GlueSchemaOnKinesisStream` command is equivalent to creating the schema manually in the AWS Glue Data Catalog with the following steps: 125 | 126 | (1) On the AWS Glue console, choose **Data Catalog**.
127 | (2) Choose **Databases**, and click **Add database**.
128 | (3) Create a database with the name `iceberg_demo_db`.
129 | (4) On the **Data Catalog** menu, Choose **Tables**, and click **Add Table**.
130 | (5) For the table name, enter `iceberg_demo_kinesis_stream_table`.
131 | (6) Select `iceberg_demo_db` as a database.
132 | (7) Choose **Kinesis** as the type of source.
133 | (8) Enter the name of the stream.
134 | (9) For the classification, choose **JSON**.
135 | (10) Define the schema according to the following table.
136 | | Column name | Data type | Example | 137 | |-------------|-----------|---------| 138 | | name | string | "Ricky" | 139 | | age | int | 23 | 140 | | m_time | string | "2023-06-13 07:24:26" | 141 | 142 | (11) Choose **Finish** 143 | 144 | 5. Upload **AWS SDK for Java 2.x** jar file into S3 145 |
146 |    (.venv) $ wget https://repo1.maven.org/maven2/software/amazon/awssdk/aws-sdk-java/2.17.224/aws-sdk-java-2.17.224.jar
147 |    (.venv) $ aws s3 cp aws-sdk-java-2.17.224.jar s3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar
148 |    
149 | A Glue Streaming Job might fail because of the following error: 150 |
151 |    py4j.protocol.Py4JJavaError: An error occurred while calling o135.start.
152 |    : java.lang.NoSuchMethodError: software.amazon.awssdk.utils.SystemSetting.getStringValueFromEnvironmentVariable(Ljava/lang/String;)Ljava/util/Optional
153 |    
154 | We can work around the problem by starting the Glue Job with the additional parameters: 155 |
156 |    --extra-jars s3://path/to/aws-sdk-for-java-v2.jar
157 |    --user-jars-first true
158 |    
159 | In order to do this, we might need to upload the **AWS SDK for Java 2.x** jar file to S3. 160 | 6. Create Glue Streaming Job 161 | 162 |    * (step 1) Select one of the Glue job scripts and upload it to S3 163 | 164 |      **List of Glue Job Scripts** 165 | | File name | Spark Writes | 166 | |-----------|--------------| 167 | | spark_iceberg_writes_with_dataframe.py | DataFrame append | 168 | | spark_iceberg_writes_with_sql_insert_overwrite.py | SQL insert overwrite | 169 | | spark_iceberg_writes_with_sql_merge_into.py | SQL merge into | 170 | 171 |
172 |      (.venv) $ ls src/main/python/
173 |       spark_iceberg_writes_with_dataframe.py
174 |       spark_iceberg_writes_with_sql_insert_overwrite.py
175 |       spark_iceberg_writes_with_sql_merge_into.py
176 |      (.venv) $ aws s3 mb s3://aws-glue-assets-123456789012-atq4q5u --region us-east-1
177 |      (.venv) $ aws s3 cp src/main/python/spark_iceberg_writes_with_dataframe.py s3://aws-glue-assets-123456789012-atq4q5u/scripts/
178 |      
179 | 180 | * (step 2) Provision the Glue Streaming Job 181 | 182 |
183 |      (.venv) $ cdk deploy GlueStreamingSinkToIcebergJobRole \
184 |                           GrantLFPermissionsOnGlueJobRole \
185 |                           GlueStreamingSinkToIceberg
186 |      
187 | 7. Make sure the Glue job can access the Kinesis Data Streams table in the Glue Data Catalog database; otherwise, grant the Glue job the required permissions 188 | 189 |    We can check the currently granted permissions by running the following command: 190 |
191 |    (.venv) $ aws lakeformation list-permissions | jq -r '.PrincipalResourcePermissions[] | select(.Principal.DataLakePrincipalIdentifier | endswith(":role/GlueStreamingJobRole-Iceberg"))'
192 |    
193 | If not found, we need to manually grant the Glue job the required permissions by running the following command: 194 |
195 |    (.venv) $ aws lakeformation grant-permissions \
196 |                --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/GlueStreamingJobRole-Iceberg \
197 |                --permissions SELECT DESCRIBE ALTER INSERT DELETE \
198 |                --resource '{ "Table": {"DatabaseName": "iceberg_demo_db", "TableWildcard": {}} }'
199 |    
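
   The same grant can also be issued from Python with boto3. A minimal sketch; the account ID, role name, and database name are the example values used above and should be replaced with your own:

      import boto3

      lakeformation = boto3.client("lakeformation", region_name="us-east-1")

      # Example values from this README -- replace with your own account ID.
      role_arn = "arn:aws:iam::123456789012:role/GlueStreamingJobRole-Iceberg"

      # Grant the Glue job role access to every table in the demo database.
      lakeformation.grant_permissions(
          Principal={"DataLakePrincipalIdentifier": role_arn},
          Resource={"Table": {"DatabaseName": "iceberg_demo_db", "TableWildcard": {}}},
          Permissions=["SELECT", "DESCRIBE", "ALTER", "INSERT", "DELETE"],
      )
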
200 | 8. Create a table with partitioned data in Amazon Athena 201 | 202 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console.
203 | * (step 1) Create a database 204 | 205 | In order to create a new database called `iceberg_demo_db`, enter the following statement in the Athena query editor 206 | and click the **Run** button to execute the query. 207 | 208 |
209 |      CREATE DATABASE IF NOT EXISTS iceberg_demo_db
210 |      
211 | 212 |    * (step 2) Create a table 213 | 214 |      Copy the following query into the Athena query editor, replace the S3 bucket name in the `LOCATION` clause with your own bucket name, and execute the query to create a new table. 215 |
216 |       CREATE TABLE iceberg_demo_db.iceberg_demo_table (
217 |         name string,
218 |         age int,
219 |         m_time timestamp
220 |       )
221 |       PARTITIONED BY (`name`)
222 |       LOCATION 's3://glue-iceberg-demo-atq4q5u/iceberg_demo_db/iceberg_demo_table'
223 |       TBLPROPERTIES (
224 |         'table_type'='iceberg'
225 |       );
226 |       
227 | If the query is successful, a table named `iceberg_demo_table` is created and displayed on the left panel under the **Tables** section. 228 | 229 |       If you get an error, check if (a) you have updated the `LOCATION` to the correct S3 bucket name, (b) you have `iceberg_demo_db` selected under the **Database** dropdown, and (c) you have `AwsDataCatalog` selected as the **Data source**. 230 | 231 |       :information_source: If you fail to create the table, give Athena users access permissions on `iceberg_demo_db` through [AWS Lake Formation](https://console.aws.amazon.com/lakeformation/home), or grant Athena users access to `iceberg_demo_db` by running the following command: 232 |
233 |       (.venv) $ aws lakeformation grant-permissions \
234 |                 --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:user/example-user-id \
235 |                 --permissions CREATE_TABLE DESCRIBE ALTER DROP \
236 |                 --resource '{ "Database": { "Name": "iceberg_demo_db" } }'
237 |       (.venv) $ aws lakeformation grant-permissions \
238 |               --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:user/example-user-id \
239 |               --permissions SELECT DESCRIBE ALTER INSERT DELETE DROP \
240 |               --resource '{ "Table": {"DatabaseName": "iceberg_demo_db", "TableWildcard": {}} }'
241 |       
242 | 243 | 9. Run the Glue job to load data from Kinesis Data Streams into S3 244 |
245 |     (.venv) $ aws glue start-job-run --job-name streaming_data_from_kds_into_iceberg_table
246 |     
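
    The same run can be started with boto3 if you prefer Python. A minimal sketch; the job name is the one configured in `cdk.context.json`:

       import boto3

       glue = boto3.client("glue", region_name="us-east-1")

       # Start the streaming job and print the run id for later reference.
       run = glue.start_job_run(JobName="streaming_data_from_kds_into_iceberg_table")
       print(run["JobRunId"])
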
247 | 10. Generate streaming data 248 | 249 | We can synthetically generate data in JSON format using a simple Python application. 250 |
251 |     (.venv) $ python src/utils/gen_fake_kinesis_stream_data.py \
252 |                --region-name us-east-1 \
253 |                --stream-name your-stream-name \
254 |                --console \
255 |                --max-count 10
256 |     
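
    After running the generator, you can spot-check what actually landed on the stream by reading a few records back with boto3. A minimal sketch; the stream name and region are the example values used in this README:

       import boto3

       kinesis = boto3.client("kinesis", region_name="us-east-1")
       stream_name = "iceberg-demo-stream"

       # Read up to 10 records from the first shard, starting at the oldest record.
       shards = kinesis.list_shards(StreamName=stream_name)["Shards"]
       iterator = kinesis.get_shard_iterator(
           StreamName=stream_name,
           ShardId=shards[0]["ShardId"],
           ShardIteratorType="TRIM_HORIZON",
       )["ShardIterator"]

       for record in kinesis.get_records(ShardIterator=iterator, Limit=10)["Records"]:
           print(record["Data"].decode("utf-8"), end="")
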
257 | 258 |    Synthetic data example, ordered by `name` and `m_time`: 259 |
260 |     {"name": "Arica", "age": 48, "m_time": "2023-04-11 19:13:21"}
261 |     {"name": "Arica", "age": 32, "m_time": "2023-10-20 17:24:17"}
262 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
263 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
264 |     {"name": "Gonzalo", "age": 37, "m_time": "2023-01-11 06:18:26"}
265 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
266 |     {"name": "Micheal", "age": 45, "m_time": "2023-04-07 06:18:17"}
267 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
268 |     {"name": "Takisha", "age": 48, "m_time": "2023-12-20 16:44:13"}
269 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
270 |     
271 | 272 |     Spark writes using `DataFrame append` insert all ingested records into the Iceberg table. 273 |
274 |     {"name": "Arica", "age": 48, "m_time": "2023-04-11 19:13:21"}
275 |     {"name": "Arica", "age": 32, "m_time": "2023-10-20 17:24:17"}
276 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
277 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
278 |     {"name": "Gonzalo", "age": 37, "m_time": "2023-01-11 06:18:26"}
279 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
280 |     {"name": "Micheal", "age": 45, "m_time": "2023-04-07 06:18:17"}
281 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
282 |     {"name": "Takisha", "age": 48, "m_time": "2023-12-20 16:44:13"}
283 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
284 |     
285 | 286 |     Spark writes using `SQL insert overwrite` or `SQL merge into` keep only the most recently updated record for each `name` in the Iceberg table, as sketched after the following example. 287 |
288 |     {"name": "Arica", "age": 45, "m_time": "2023-12-26 01:20:49"}
289 |     {"name": "Fernando", "age": 16, "m_time": "2023-05-22 00:13:55"}
290 |     {"name": "Gonzalo", "age": 60, "m_time": "2023-01-25 16:54:26"}
291 |     {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
292 |     {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
293 |     
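
    This de-duplication is what the included job scripts implement. A condensed PySpark sketch of the idea, assuming a micro-batch DataFrame `batch_df` and an active `spark` session are in scope (see `spark_iceberg_writes_with_sql_merge_into.py` later in this repository for the full version):

       from pyspark.sql.functions import col, desc, row_number
       from pyspark.sql.window import Window

       # Keep only the latest record per primary key ("name") within the micro-batch.
       window = Window.partitionBy("name").orderBy(desc("m_time"))
       latest_df = (batch_df
                    .withColumn("row", row_number().over(window))
                    .filter(col("row") == 1)
                    .drop("row"))

       # Upsert the de-duplicated records into the Iceberg table.
       latest_df.createOrReplaceTempView("iceberg_demo_table_upsert")
       spark.sql("""
           MERGE INTO job_catalog.iceberg_demo_db.iceberg_demo_table t
           USING iceberg_demo_table_upsert s ON s.name = t.name
           WHEN MATCHED THEN UPDATE SET *
           WHEN NOT MATCHED THEN INSERT *
       """)
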
294 | 11. Check streaming data in S3 295 | 296 | After `3~5` minutes, you can see that the streaming data have been delivered from **Kinesis Data Streams** to **S3**. 297 | 298 | ![iceberg-table](./assets/iceberg-table.png) 299 | ![iceberg-table](./assets/iceberg-data-level-01.png) 300 | ![iceberg-table](./assets/iceberg-data-level-02.png) 301 | ![iceberg-table](./assets/iceberg-data-level-03.png) 302 | 303 | 12. Run test query 304 | 305 | Enter the following SQL statement and execute the query. 306 |
307 |     SELECT COUNT(*)
308 |     FROM iceberg_demo_db.iceberg_demo_table;
309 |     
310 | 311 | ## Clean Up 312 | 313 | 1. Stop the Glue job, replacing the job name in the command below if you changed it. 314 | 315 |
316 |    (.venv) $ JOB_RUN_IDS=$(aws glue get-job-runs \
317 |               --job-name streaming_data_from_kds_into_iceberg_table | jq -r '.JobRuns[] | select(.JobRunState=="RUNNING") | .Id' \
318 |               | xargs)
319 |    (.venv) $ aws glue batch-stop-job-run \
320 |               --job-name streaming_data_from_kds_into_iceberg_table \
321 |               --job-run-ids $JOB_RUN_IDS
322 |    
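
   The equivalent with boto3, if you prefer Python. A minimal sketch using the example job name:

      import boto3

      glue = boto3.client("glue", region_name="us-east-1")
      job_name = "streaming_data_from_kds_into_iceberg_table"

      # Collect the ids of all currently running runs and stop them in one call.
      runs = glue.get_job_runs(JobName=job_name)["JobRuns"]
      running_ids = [r["Id"] for r in runs if r["JobRunState"] == "RUNNING"]
      if running_ids:
          glue.batch_stop_job_run(JobName=job_name, JobRunIds=running_ids)
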
323 | 324 | 2. Delete the CloudFormation stacks by running the command below. 325 | 326 |
327 |    (.venv) $ cdk destroy --all
328 |    
329 | 330 | ## Useful commands 331 | 332 | * `cdk ls` list all stacks in the app 333 | * `cdk synth` emits the synthesized CloudFormation template 334 | * `cdk deploy` deploy this stack to your default AWS account/region 335 | * `cdk diff` compare deployed stack with current state 336 | * `cdk docs` open CDK documentation 337 | 338 | ## References 339 | 340 | * (1) [AWS Glue versions](https://docs.aws.amazon.com/glue/latest/dg/release-notes.html): The AWS Glue version determines the versions of Apache Spark and Python that AWS Glue supports. 341 | * (2) [Use the AWS Glue connector to read and write Apache Iceberg tables with ACID transactions and perform time travel \(2022-06-21\)](https://aws.amazon.com/ko/blogs/big-data/use-the-aws-glue-connector-to-read-and-write-apache-iceberg-tables-with-acid-transactions-and-perform-time-travel/) 342 | * (3) [Streaming Data into Apache Iceberg Tables Using AWS Kinesis and AWS Glue (2022-09-26)](https://www.dremio.com/subsurface/streaming-data-into-apache-iceberg-tables-using-aws-kinesis-and-aws-glue/) 343 | * (4) [Amazon Athena Using Iceberg tables](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html) 344 | * (5) [Streaming ETL jobs in AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/add-job-streaming.html) 345 | * (6) [AWS Glue job parameters](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) 346 | * (7) [Crafting serverless streaming ETL jobs with AWS Glue](https://aws.amazon.com/ko/blogs/big-data/crafting-serverless-streaming-etl-jobs-with-aws-glue/) 347 | * (8) [Apache Iceberg - Spark Writes with SQL (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-writes/) 348 | * (9) [Apache Iceberg - Spark Structured Streaming (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/) 349 | * (10) [Apache Iceberg - Writing against partitioned table (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#writing-against-partitioned-table) 350 | * Iceberg supports append and complete output modes: 351 | * `append`: appends the rows of every micro-batch to the table 352 | * `complete`: replaces the table contents every micro-batch 353 | 354 | Iceberg requires the data to be sorted according to the partition spec per task (Spark partition) in prior to write against partitioned table.
355 | Otherwise, you might encounter the following error: 356 |
357 |        pyspark.sql.utils.AnalysisException: Complete output mode not supported when there are no streaming aggregations on streaming DataFrame/Datasets;
358 |        
359 | * (11) [Apache Iceberg - Maintenance for streaming tables (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#maintenance-for-streaming-tables) 360 | * (12) [awsglue python package](https://github.com/awslabs/aws-glue-libs): The awsglue Python package contains the Python portion of the AWS Glue library. This library extends PySpark to support serverless ETL on AWS. 361 | * (13) [AWS Glue Notebook Samples](https://github.com/aws-samples/aws-glue-samples/tree/master/examples/notebooks) - sample iPython notebook files which show you how to use open data lake formats: Apache Hudi, Delta Lake, and Apache Iceberg on AWS Glue Interactive Sessions and AWS Glue Studio Notebook. 362 | 363 | ## Troubleshooting 364 | 365 | * Granting database or table permissions error using AWS CDK 366 |   * Error message: 367 |
368 |      AWS::LakeFormation::PrincipalPermissions | CfnPrincipalPermissions Resource handler returned message: "Resource does not exist or requester is not authorized to access requested permissions. (Service: LakeFormation, Status Code: 400, Request ID: f4d5e58b-29b6-4889-9666-7e38420c9035)" (RequestToken: 4a4bb1d6-b051-032f-dd12-5951d7b4d2a9, HandlerErrorCode: AccessDenied)
369 |      
370 | * Solution: 371 | 372 | The role assumed by cdk is not a data lake administrator. (e.g., `cdk-hnb659fds-deploy-role-12345678912-us-east-1`)
373 |      So, deploying PrincipalPermissions fails with an error such as: 374 | 375 |      `Resource does not exist or requester is not authorized to access requested permissions.` 376 | 377 |      In order to resolve the error, it is necessary to promote the cdk execution role to a data lake administrator.
378 | For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 379 | 380 | * Reference: 381 | 382 | [https://github.com/aws-samples/data-lake-as-code](https://github.com/aws-samples/data-lake-as-code) - Data Lake as Code 383 | 384 | ## Security 385 | 386 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 387 | 388 | ## License 389 | 390 | This library is licensed under the MIT-0 License. See the LICENSE file. 391 | 392 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from cdk_stacks import ( 7 | KdsStack, 8 | GlueJobRoleStack, 9 | GlueStreamDataSchemaStack, 10 | GlueStreamingJobStack, 11 | DataLakePermissionsStack, 12 | S3BucketStack 13 | ) 14 | 15 | APP_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 16 | region=os.getenv('CDK_DEFAULT_REGION')) 17 | 18 | app = cdk.App() 19 | 20 | s3_bucket = S3BucketStack(app, 'IcebergS3Path') 21 | 22 | kds_stack = KdsStack(app, 'KinesisStreamAsGlueStreamingJobDataSource') 23 | kds_stack.add_dependency(s3_bucket) 24 | 25 | glue_job_role = GlueJobRoleStack(app, 'GlueStreamingSinkToIcebergJobRole') 26 | glue_job_role.add_dependency(kds_stack) 27 | 28 | glue_stream_schema = GlueStreamDataSchemaStack(app, 'GlueSchemaOnKinesisStream', 29 | kds_stack.kinesis_stream 30 | ) 31 | glue_stream_schema.add_dependency(kds_stack) 32 | 33 | grant_lake_formation_permissions = DataLakePermissionsStack(app, 'GrantLFPermissionsOnGlueJobRole', 34 | glue_job_role.glue_job_role 35 | ) 36 | grant_lake_formation_permissions.add_dependency(glue_job_role) 37 | grant_lake_formation_permissions.add_dependency(glue_stream_schema) 38 | 39 | glue_streaming_job = GlueStreamingJobStack(app, 'GlueStreamingSinkToIceberg', 40 | glue_job_role.glue_job_role, 41 | kds_stack.kinesis_stream 42 | ) 43 | glue_streaming_job.add_dependency(grant_lake_formation_permissions) 44 | 45 | app.synth() 46 | -------------------------------------------------------------------------------- /assets/iceberg-data-level-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-etl-with-apache-iceberg/19323e67a5b424462088c58761cc5fce7d4680d4/assets/iceberg-data-level-01.png -------------------------------------------------------------------------------- /assets/iceberg-data-level-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-etl-with-apache-iceberg/19323e67a5b424462088c58761cc5fce7d4680d4/assets/iceberg-data-level-02.png -------------------------------------------------------------------------------- /assets/iceberg-data-level-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-etl-with-apache-iceberg/19323e67a5b424462088c58761cc5fce7d4680d4/assets/iceberg-data-level-03.png -------------------------------------------------------------------------------- /assets/iceberg-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-streaming-etl-with-apache-iceberg/19323e67a5b424462088c58761cc5fce7d4680d4/assets/iceberg-table.png 
-------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 19 | "@aws-cdk/core:checkSecretUsage": true, 20 | "@aws-cdk/core:target-partitions": [ 21 | "aws", 22 | "aws-cn" 23 | ], 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 29 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 30 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 31 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 32 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 33 | "@aws-cdk/core:enablePartitionLiterals": true, 34 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 35 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 36 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 37 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .kds import KdsStack 2 | from .glue_job_role import GlueJobRoleStack 3 | from .glue_stream_data_schema import GlueStreamDataSchemaStack 4 | from .glue_streaming_job import GlueStreamingJobStack 5 | from .lakeformation_permissions import DataLakePermissionsStack 6 | from .s3 import S3BucketStack 7 | -------------------------------------------------------------------------------- /cdk_stacks/glue_job_role.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_iam 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueJobRoleStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_job_role_policy_doc = aws_iam.PolicyDocument() 20 | glue_job_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 21 | "sid": "AWSGlueJobDynamoDBAccess", 22 | "effect": aws_iam.Effect.ALLOW, 23 | #XXX: The ARN will be formatted as follows: 24 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 25 | "resources": [self.format_arn(service="dynamodb", resource="table", resource_name="*")], 26 | "actions": [ 27 | "dynamodb:BatchGetItem", 28 | "dynamodb:DescribeStream", 29 | "dynamodb:DescribeTable", 30 | "dynamodb:GetItem", 31 | "dynamodb:Query", 32 | "dynamodb:Scan", 33 | "dynamodb:BatchWriteItem", 34 | "dynamodb:CreateTable", 35 | "dynamodb:DeleteTable", 36 | "dynamodb:DeleteItem", 37 | "dynamodb:UpdateTable", 38 | "dynamodb:UpdateItem", 39 | "dynamodb:PutItem" 40 | ] 41 | })) 42 | 43 | 
glue_job_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 44 | "sid": "AWSGlueJobS3Access", 45 | "effect": aws_iam.Effect.ALLOW, 46 | #XXX: The ARN will be formatted as follows: 47 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 48 | "resources": ["*"], 49 | "actions": [ 50 | "s3:GetBucketLocation", 51 | "s3:ListBucket", 52 | "s3:GetBucketAcl", 53 | "s3:GetObject", 54 | "s3:PutObject", 55 | "s3:DeleteObject" 56 | ] 57 | })) 58 | 59 | glue_job_role = aws_iam.Role(self, 'GlueJobRole', 60 | role_name='GlueStreamingJobRole-Iceberg', 61 | assumed_by=aws_iam.ServicePrincipal('glue.amazonaws.com'), 62 | inline_policies={ 63 | 'aws_glue_job_role_policy': glue_job_role_policy_doc 64 | }, 65 | managed_policies=[ 66 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSGlueServiceRole'), 67 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMReadOnlyAccess'), 68 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEC2ContainerRegistryReadOnly'), 69 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AWSGlueConsoleFullAccess'), 70 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisReadOnlyAccess') 71 | ] 72 | ) 73 | 74 | #XXX: When creating a notebook with a role, that role is then passed to interactive sessions 75 | # so that the same role can be used in both places. 76 | # As such, the `iam:PassRole` permission needs to be part of the role's policy. 77 | # More info at: https://docs.aws.amazon.com/glue/latest/ug/notebook-getting-started.html 78 | # 79 | glue_job_role.add_to_policy(aws_iam.PolicyStatement(**{ 80 | "sid": "AWSGlueJobIAMPassRole", 81 | "effect": aws_iam.Effect.ALLOW, 82 | #XXX: The ARN will be formatted as follows: 83 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}{resource-name} 84 | "resources": [self.format_arn(service="iam", region="", resource="role", resource_name=glue_job_role.role_name)], 85 | "conditions": { 86 | "StringLike": { 87 | "iam:PassedToService": [ 88 | "glue.amazonaws.com" 89 | ] 90 | } 91 | }, 92 | "actions": [ 93 | "iam:PassRole" 94 | ] 95 | })) 96 | 97 | self.glue_job_role = glue_job_role 98 | 99 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRole', value=self.glue_job_role.role_name) 100 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRoleArn', value=self.glue_job_role.role_arn) 101 | -------------------------------------------------------------------------------- /cdk_stacks/glue_stream_data_schema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueStreamDataSchemaStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, kinesis_stream, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_kinesis_table = self.node.try_get_context('glue_kinesis_table') 20 | database_name = glue_kinesis_table['database_name'] 21 | table_name = glue_kinesis_table['table_name'] 22 | columns = glue_kinesis_table.get('columns', []) 23 | 24 | cfn_database = aws_glue.CfnDatabase(self, "GlueCfnDatabase", 25 | catalog_id=cdk.Aws.ACCOUNT_ID, 26 | database_input=aws_glue.CfnDatabase.DatabaseInputProperty( 27 | name=database_name 28 | ) 29 | ) 30 | cfn_database.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 31 | 32 | 
cfn_table = aws_glue.CfnTable(self, "GlueCfnTable", 33 | catalog_id=cdk.Aws.ACCOUNT_ID, 34 | database_name=database_name, 35 | table_input=aws_glue.CfnTable.TableInputProperty( 36 | name=table_name, 37 | parameters={"classification": "json"}, 38 | storage_descriptor=aws_glue.CfnTable.StorageDescriptorProperty( 39 | columns=columns, 40 | input_format="org.apache.hadoop.mapred.TextInputFormat", 41 | location=kinesis_stream.stream_name, 42 | output_format="org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", 43 | parameters={ 44 | "streamARN": kinesis_stream.stream_arn, 45 | "typeOfData": "kinesis" 46 | }, 47 | serde_info=aws_glue.CfnTable.SerdeInfoProperty( 48 | serialization_library="org.openx.data.jsonserde.JsonSerDe" 49 | ) 50 | ), 51 | table_type="EXTERNAL_TABLE" 52 | ) 53 | ) 54 | 55 | cfn_table.add_dependency(cfn_database) 56 | cfn_table.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 57 | 58 | cdk.CfnOutput(self, f'{self.stack_name}_GlueDatabaseName', value=cfn_table.database_name) 59 | -------------------------------------------------------------------------------- /cdk_stacks/glue_streaming_job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue, 10 | aws_s3 as s3, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class GlueStreamingJobStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, glue_job_role, kinesis_stream, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | glue_assets_s3_bucket_name = self.node.try_get_context('glue_assets_s3_bucket_name') 21 | glue_job_script_file_name = self.node.try_get_context('glue_job_script_file_name') 22 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 23 | 24 | glue_job_default_arguments = { 25 | "--kinesis_stream_arn": kinesis_stream.stream_arn, 26 | "--enable-metrics": "true", 27 | "--enable-spark-ui": "true", 28 | "--spark-event-logs-path": f"s3://{glue_assets_s3_bucket_name}/sparkHistoryLogs/", 29 | "--enable-job-insights": "false", 30 | "--enable-glue-datacatalog": "true", 31 | "--enable-continuous-cloudwatch-log": "true", 32 | "--job-bookmark-option": "job-bookmark-disable", 33 | "--job-language": "python", 34 | "--TempDir": f"s3://{glue_assets_s3_bucket_name}/temporary/" 35 | } 36 | 37 | glue_job_default_arguments.update(glue_job_input_arguments) 38 | 39 | glue_job_name = self.node.try_get_context('glue_job_name') 40 | 41 | glue_connections_name = self.node.try_get_context('glue_connections_name') 42 | 43 | glue_cfn_job = aws_glue.CfnJob(self, "GlueStreamingETLJob", 44 | command=aws_glue.CfnJob.JobCommandProperty( 45 | name="gluestreaming", 46 | python_version="3", 47 | script_location="s3://{glue_assets}/scripts/{glue_job_script_file_name}".format( 48 | glue_assets=glue_assets_s3_bucket_name, 49 | glue_job_script_file_name=glue_job_script_file_name 50 | ) 51 | ), 52 | role=glue_job_role.role_arn, 53 | 54 | #XXX: Set only AllocatedCapacity or MaxCapacity 55 | # Do not set Allocated Capacity if using Worker Type and Number of Workers 56 | # allocated_capacity=2, 57 | connections=aws_glue.CfnJob.ConnectionsListProperty( 58 | connections=[glue_connections_name] 59 | ), 60 | default_arguments=glue_job_default_arguments, 61 | description="This job loads the data from Kinesis Data Streams to S3.", 62 | 
execution_property=aws_glue.CfnJob.ExecutionPropertyProperty( 63 | max_concurrent_runs=1 64 | ), 65 | #XXX: check AWS Glue Version in https://docs.aws.amazon.com/glue/latest/dg/add-job.html#create-job 66 | glue_version="3.0", 67 | #XXX: Do not set Max Capacity if using Worker Type and Number of Workers 68 | # max_capacity=2, 69 | max_retries=0, 70 | name=glue_job_name, 71 | # notification_property=aws_glue.CfnJob.NotificationPropertyProperty( 72 | # notify_delay_after=10 # 10 minutes 73 | # ), 74 | number_of_workers=2, 75 | timeout=2880, 76 | worker_type="G.1X" # ['Standard' | 'G.1X' | 'G.2X' | 'G.025x'] 77 | ) 78 | 79 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobName', value=glue_cfn_job.name) 80 | cdk.CfnOutput(self, f'{self.stack_name}_GlueJobRoleArn', value=glue_job_role.role_arn) 81 | -------------------------------------------------------------------------------- /cdk_stacks/kds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Duration, 12 | Stack, 13 | aws_kinesis, 14 | ) 15 | from constructs import Construct 16 | 17 | random.seed(23) 18 | 19 | 20 | class KdsStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | KINESIS_DEFAULT_STREAM_NAME = 'PUT-{}'.format(''.join(random.sample((string.ascii_letters), k=5))) 26 | KINESIS_STREAM_NAME = self.node.try_get_context('kinesis_stream_name') or KINESIS_DEFAULT_STREAM_NAME 27 | 28 | source_kinesis_stream = aws_kinesis.Stream(self, "SourceKinesisStreams", 29 | retention_period=Duration.hours(24), 30 | stream_mode=aws_kinesis.StreamMode.ON_DEMAND, 31 | stream_name=KINESIS_STREAM_NAME) 32 | 33 | self.kinesis_stream = source_kinesis_stream 34 | 35 | cdk.CfnOutput(self, f'{self.stack_name}_KinesisDataStreamName', value=self.kinesis_stream.stream_name) 36 | 37 | -------------------------------------------------------------------------------- /cdk_stacks/lakeformation_permissions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, glue_job_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | glue_job_input_arguments = self.node.try_get_context('glue_kinesis_table') 20 | database_name = glue_job_input_arguments["database_name"] 21 | 22 | #XXXX: The role assumed by cdk is not a data lake administrator. 23 | # So, deploying PrincipalPermissions meets the error such as: 24 | # "Resource does not exist or requester is not authorized to access requested permissions." 25 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
26 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 27 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 28 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 29 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 30 | )] 31 | ) 32 | 33 | cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions", 34 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 35 | permissions_with_grant_option=[], 36 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 37 | data_lake_principal_identifier=glue_job_role.role_arn 38 | ), 39 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 40 | #XXX: Can't specify a TableWithColumns resource and a Table resource 41 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 42 | catalog_id=cdk.Aws.ACCOUNT_ID, 43 | database_name=database_name, 44 | # name="ALL_TABLES", 45 | table_wildcard={} 46 | ) 47 | ) 48 | ) 49 | cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 50 | 51 | #XXX: In order to keep resource destruction order, 52 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 53 | cfn_principal_permissions.add_dependency(cfn_data_lake_settings) 54 | 55 | cdk.CfnOutput(self, f'{self.stack_name}_Principal', 56 | value=cfn_principal_permissions.attr_principal_identifier) 57 | -------------------------------------------------------------------------------- /cdk_stacks/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | from urllib.parse import urlparse 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_s3 as s3 12 | ) 13 | 14 | from constructs import Construct 15 | 16 | 17 | class S3BucketStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments') 23 | s3_path = glue_job_input_arguments["--iceberg_s3_path"] 24 | s3_bucket_name = urlparse(s3_path).netloc 25 | 26 | s3_bucket = s3.Bucket(self, "s3bucket", 27 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 28 | bucket_name=s3_bucket_name) 29 | 30 | self.s3_bucket_name = s3_bucket.bucket_name 31 | 32 | cdk.CfnOutput(self, f'{self.stack_name}_S3Bucket', value=self.s3_bucket_name) 33 | -------------------------------------------------------------------------------- /glue-streaming-data-to-iceberg-table.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
AWS Cloud
AWS Cloud
Glue Data Catalog
Glue Data Catalog
Kinesis Data Streams
Kinesis Data Streams
Glue Streaming
Glue Streaming
Athena
Athena
S3
S3
Python Data Generator
Python Data Generator
Viewer does not support full SVG 1.1
-------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.24.41 2 | mimesis==6.0.0 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.59.0 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /src/main/python/spark_iceberg_writes_with_dataframe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | import re 8 | 9 | from awsglue.transforms import * 10 | from awsglue.utils import getResolvedOptions 11 | from pyspark.context import SparkContext 12 | from awsglue.context import GlueContext 13 | from awsglue.job import Job 14 | from awsglue import DynamicFrame 15 | 16 | from pyspark.conf import SparkConf 17 | from pyspark.sql import DataFrame, Row 18 | from pyspark.sql.types import * 19 | from pyspark.sql.functions import * 20 | 21 | 22 | def get_kinesis_stream_name_from_arn(stream_arn): 23 | ARN_PATTERN = re.compile(r'arn:aws:kinesis:([a-z0-9-]+):(\d+):stream/([a-zA-Z0-9-_]+)') 24 | results = ARN_PATTERN.match(stream_arn) 25 | return results.group(3) 26 | 27 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 28 | 'catalog', 29 | 'database_name', 30 | 'table_name', 31 | 'kinesis_table_name', 32 | 'kinesis_stream_arn', 33 | 'starting_position_of_kinesis_iterator', 34 | 'iceberg_s3_path', 35 | 'lock_table_name', 36 | 'aws_region', 37 | 'window_size' 38 | ]) 39 | 40 | CATALOG = args['catalog'] 41 | 42 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 43 | DATABASE = args['database_name'] 44 | TABLE_NAME = args['table_name'] 45 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 46 | 47 | KINESIS_TABLE_NAME = args['kinesis_table_name'] 48 | KINESIS_STREAM_ARN = args['kinesis_stream_arn'] 49 | KINESIS_STREAM_NAME = get_kinesis_stream_name_from_arn(KINESIS_STREAM_ARN) 50 | 51 | #XXX: starting_position_of_kinesis_iterator: ['LATEST', 'TRIM_HORIZON'] 52 | STARTING_POSITION_OF_KINESIS_ITERATOR = args.get('starting_position_of_kinesis_iterator', 'LATEST') 53 | 54 | AWS_REGION = args['aws_region'] 55 | WINDOW_SIZE = args.get('window_size', '100 seconds') 56 | 57 | def setSparkIcebergConf() -> SparkConf: 58 | conf_list = [ 59 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 60 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 61 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 62 | (f"spark.sql.catalog.{CATALOG}.io-impl", 
"org.apache.iceberg.aws.s3.S3FileIO"), 63 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 64 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 65 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 66 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 67 | ] 68 | spark_conf = SparkConf().setAll(conf_list) 69 | return spark_conf 70 | 71 | # Set the Spark + Glue context 72 | conf = setSparkIcebergConf() 73 | sc = SparkContext(conf=conf) 74 | glueContext = GlueContext(sc) 75 | spark = glueContext.spark_session 76 | job = Job(glueContext) 77 | job.init(args['JOB_NAME'], args) 78 | 79 | # Read from Kinesis Data Stream 80 | streaming_data = spark.readStream \ 81 | .format("kinesis") \ 82 | .option("streamName", KINESIS_STREAM_NAME) \ 83 | .option("endpointUrl", f"https://kinesis.{AWS_REGION}.amazonaws.com") \ 84 | .option("startingPosition", STARTING_POSITION_OF_KINESIS_ITERATOR) \ 85 | .load() 86 | 87 | streaming_data_df = streaming_data \ 88 | .select(from_json(col("data").cast("string"), \ 89 | glueContext.get_catalog_schema_as_spark_schema(DATABASE, KINESIS_TABLE_NAME)) \ 90 | .alias("source_table")) \ 91 | .select("source_table.*") \ 92 | .withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 93 | 94 | table_identifier = f"{CATALOG}.{DATABASE}.{TABLE_NAME}" 95 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 96 | 97 | #XXX: Writing against partitioned table 98 | # https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#writing-against-partitioned-table 99 | # Complete output mode not supported when there are no streaming aggregations on streaming DataFrame/Datasets 100 | query = streaming_data_df.writeStream \ 101 | .format("iceberg") \ 102 | .outputMode("append") \ 103 | .trigger(processingTime=WINDOW_SIZE) \ 104 | .option("path", table_identifier) \ 105 | .option("fanout-enabled", "true") \ 106 | .option("checkpointLocation", checkpointPath) \ 107 | .start() 108 | 109 | query.awaitTermination() 110 | -------------------------------------------------------------------------------- /src/main/python/spark_iceberg_writes_with_sql_insert_overwrite.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | import traceback 8 | 9 | from awsglue.transforms import * 10 | from awsglue.utils import getResolvedOptions 11 | from pyspark.context import SparkContext 12 | from awsglue.context import GlueContext 13 | from awsglue.job import Job 14 | from awsglue import DynamicFrame 15 | 16 | from pyspark.conf import SparkConf 17 | from pyspark.sql import DataFrame, Row 18 | from pyspark.sql.window import Window 19 | from pyspark.sql.functions import ( 20 | col, 21 | desc, 22 | row_number, 23 | to_timestamp 24 | ) 25 | 26 | 27 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 28 | 'catalog', 29 | 'database_name', 30 | 'table_name', 31 | 'primary_key', 32 | 'kinesis_stream_arn', 33 | 'starting_position_of_kinesis_iterator', 34 | 'iceberg_s3_path', 35 | 'lock_table_name', 36 | 'aws_region', 37 | 'window_size' 38 | ]) 39 | 40 | CATALOG = args['catalog'] 41 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 42 | DATABASE = args['database_name'] 43 | TABLE_NAME = args['table_name'] 44 | PRIMARY_KEY = args['primary_key'] 45 | DYNAMODB_LOCK_TABLE = 
args['lock_table_name'] 46 | KINESIS_STREAM_ARN = args['kinesis_stream_arn'] 47 | #XXX: starting_position_of_kinesis_iterator: ['LATEST', 'TRIM_HORIZON'] 48 | STARTING_POSITION_OF_KINESIS_ITERATOR = args.get('starting_position_of_kinesis_iterator', 'LATEST') 49 | AWS_REGION = args['aws_region'] 50 | WINDOW_SIZE = args.get('window_size', '100 seconds') 51 | 52 | def setSparkIcebergConf() -> SparkConf: 53 | conf_list = [ 54 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 55 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 56 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 57 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 58 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 59 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 60 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 61 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 62 | ] 63 | spark_conf = SparkConf().setAll(conf_list) 64 | return spark_conf 65 | 66 | # Set the Spark + Glue context 67 | conf = setSparkIcebergConf() 68 | sc = SparkContext(conf=conf) 69 | glueContext = GlueContext(sc) 70 | spark = glueContext.spark_session 71 | job = Job(glueContext) 72 | job.init(args['JOB_NAME'], args) 73 | 74 | kds_df = glueContext.create_data_frame.from_options( 75 | connection_type="kinesis", 76 | connection_options={ 77 | "typeOfData": "kinesis", 78 | "streamARN": KINESIS_STREAM_ARN, 79 | "classification": "json", 80 | "startingPosition": f"{STARTING_POSITION_OF_KINESIS_ITERATOR}", 81 | "inferSchema": "true", 82 | }, 83 | transformation_ctx="kds_df", 84 | ) 85 | 86 | def processBatch(data_frame, batch_id): 87 | if data_frame.count() > 0: 88 | stream_data_dynf = DynamicFrame.fromDF( 89 | data_frame, glueContext, "from_data_frame" 90 | ) 91 | 92 | _df = spark.sql(f"SELECT * FROM {CATALOG}.{DATABASE}.{TABLE_NAME} LIMIT 0") 93 | 94 | # Apply De-duplication logic on input data to pick up the latest record based on timestamp and operation 95 | window = Window.partitionBy(PRIMARY_KEY).orderBy(desc("m_time")) 96 | stream_data_df = stream_data_dynf.toDF() 97 | stream_data_df = stream_data_df.withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 98 | upsert_data_df = stream_data_df.withColumn("row", row_number().over(window)) \ 99 | .filter(col("row") == 1).drop("row") \ 100 | .select(_df.schema.names) 101 | 102 | upsert_data_df.createOrReplaceTempView(f"{TABLE_NAME}_upsert") 103 | # print(f"Table '{TABLE_NAME}' is upserting...") 104 | 105 | sql_query = f""" 106 | INSERT OVERWRITE {CATALOG}.{DATABASE}.{TABLE_NAME} SELECT * FROM {TABLE_NAME}_upsert 107 | """ 108 | try: 109 | spark.sql(sql_query) 110 | except Exception as ex: 111 | traceback.print_exc() 112 | raise ex 113 | 114 | 115 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 116 | 117 | glueContext.forEachBatch( 118 | frame=kds_df, 119 | batch_function=processBatch, 120 | options={ 121 | "windowSize": WINDOW_SIZE, 122 | "checkpointLocation": checkpointPath, 123 | } 124 | ) 125 | 126 | job.commit() 127 | -------------------------------------------------------------------------------- /src/main/python/spark_iceberg_writes_with_sql_merge_into.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 
shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import sys 7 | import traceback 8 | 9 | from awsglue.transforms import * 10 | from awsglue.utils import getResolvedOptions 11 | from awsglue.context import GlueContext 12 | from awsglue.job import Job 13 | from awsglue import DynamicFrame 14 | 15 | from pyspark.context import SparkContext 16 | from pyspark.conf import SparkConf 17 | from pyspark.sql import DataFrame, Row 18 | from pyspark.sql.window import Window 19 | from pyspark.sql.functions import ( 20 | col, 21 | desc, 22 | row_number, 23 | to_timestamp 24 | ) 25 | 26 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 27 | 'catalog', 28 | 'database_name', 29 | 'table_name', 30 | 'primary_key', 31 | 'kinesis_stream_arn', 32 | 'starting_position_of_kinesis_iterator', 33 | 'iceberg_s3_path', 34 | 'lock_table_name', 35 | 'aws_region', 36 | 'window_size' 37 | ]) 38 | 39 | CATALOG = args['catalog'] 40 | ICEBERG_S3_PATH = args['iceberg_s3_path'] 41 | DATABASE = args['database_name'] 42 | TABLE_NAME = args['table_name'] 43 | PRIMARY_KEY = args['primary_key'] 44 | DYNAMODB_LOCK_TABLE = args['lock_table_name'] 45 | KINESIS_STREAM_ARN = args['kinesis_stream_arn'] 46 | #XXX: starting_position_of_kinesis_iterator: ['LATEST', 'TRIM_HORIZON'] 47 | STARTING_POSITION_OF_KINESIS_ITERATOR = args.get('starting_position_of_kinesis_iterator', 'LATEST') 48 | AWS_REGION = args['aws_region'] 49 | WINDOW_SIZE = args.get('window_size', '100 seconds') 50 | 51 | def setSparkIcebergConf() -> SparkConf: 52 | conf_list = [ 53 | (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"), 54 | (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH), 55 | (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"), 56 | (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"), 57 | (f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager"), 58 | (f"spark.sql.catalog.{CATALOG}.lock.table", DYNAMODB_LOCK_TABLE), 59 | ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"), 60 | ("spark.sql.iceberg.handle-timestamp-without-timezone", "true") 61 | ] 62 | spark_conf = SparkConf().setAll(conf_list) 63 | return spark_conf 64 | 65 | # Set the Spark + Glue context 66 | conf = setSparkIcebergConf() 67 | sc = SparkContext(conf=conf) 68 | glueContext = GlueContext(sc) 69 | spark = glueContext.spark_session 70 | job = Job(glueContext) 71 | job.init(args['JOB_NAME'], args) 72 | 73 | kds_df = glueContext.create_data_frame.from_options( 74 | connection_type="kinesis", 75 | connection_options={ 76 | "typeOfData": "kinesis", 77 | "streamARN": KINESIS_STREAM_ARN, 78 | "classification": "json", 79 | "startingPosition": f"{STARTING_POSITION_OF_KINESIS_ITERATOR}", 80 | "inferSchema": "true", 81 | }, 82 | transformation_ctx="kds_df", 83 | ) 84 | 85 | def processBatch(data_frame, batch_id): 86 | if data_frame.count() > 0: 87 | stream_data_dynf = DynamicFrame.fromDF( 88 | data_frame, glueContext, "from_data_frame" 89 | ) 90 | 91 | tables_df = spark.sql(f"SHOW TABLES IN {CATALOG}.{DATABASE}") 92 | table_list = tables_df.select('tableName').rdd.flatMap(lambda x: x).collect() 93 | if f"{TABLE_NAME}" not in table_list: 94 | print(f"Table {TABLE_NAME} doesn't exist in {CATALOG}.{DATABASE}.") 95 | else: 96 | _df = spark.sql(f"SELECT * FROM {CATALOG}.{DATABASE}.{TABLE_NAME} LIMIT 0") 97 | 98 | # Apply De-duplication logic on input data to pick up the latest record based on timestamp and operation 99 | 
window = Window.partitionBy(PRIMARY_KEY).orderBy(desc("m_time")) 100 | stream_data_df = stream_data_dynf.toDF() 101 | stream_data_df = stream_data_df.withColumn('m_time', to_timestamp(col('m_time'), 'yyyy-MM-dd HH:mm:ss')) 102 | upsert_data_df = stream_data_df.withColumn("row", row_number().over(window)) \ 103 | .filter(col("row") == 1).drop("row") \ 104 | .select(_df.schema.names) 105 | 106 | upsert_data_df.createOrReplaceTempView(f"{TABLE_NAME}_upsert") 107 | # print(f"Table '{TABLE_NAME}' is upserting...") 108 | 109 | try: 110 | spark.sql(f"""MERGE INTO {CATALOG}.{DATABASE}.{TABLE_NAME} t 111 | USING {TABLE_NAME}_upsert s ON s.{PRIMARY_KEY} = t.{PRIMARY_KEY} 112 | WHEN MATCHED THEN UPDATE SET * 113 | WHEN NOT MATCHED THEN INSERT * 114 | """) 115 | except Exception as ex: 116 | traceback.print_exc() 117 | raise ex 118 | 119 | checkpointPath = os.path.join(args["TempDir"], args["JOB_NAME"], "checkpoint/") 120 | 121 | glueContext.forEachBatch( 122 | frame=kds_df, 123 | batch_function=processBatch, 124 | options={ 125 | "windowSize": WINDOW_SIZE, 126 | "checkpointLocation": checkpointPath, 127 | } 128 | ) 129 | 130 | job.commit() 131 | -------------------------------------------------------------------------------- /src/utils/gen_fake_kinesis_stream_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import datetime 8 | import json 9 | import random 10 | import time 11 | 12 | import boto3 13 | from mimesis.locales import Locale 14 | from mimesis.schema import Field, Schema 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | 20 | parser.add_argument('--region-name', action='store', default='us-east-1', 21 | help='aws region name (default: us-east-1)') 22 | parser.add_argument('--stream-name', help='The name of the stream to put the data record into') 23 | parser.add_argument('--max-count', default=10, type=int, help='The max number of records to put (default: 10)') 24 | parser.add_argument('--dry-run', action='store_true') 25 | parser.add_argument('--console', action='store_true', help='Print out records ingested into the stream') 26 | 27 | options = parser.parse_args() 28 | 29 | _CURRENT_YEAR = datetime.now().year 30 | _NAMES = 'Arica,Burton,Cory,Fernando,Gonzalo,Kenton,Linsey,Micheal,Ricky,Takisha'.split(',') 31 | 32 | #XXX: For more information about synthetic data schema, see 33 | # https://github.com/aws-samples/aws-glue-streaming-etl-blog/blob/master/config/generate_data.py 34 | _ = Field(locale=Locale.EN) 35 | 36 | _schema = Schema(schema=lambda: { 37 | # "name": _("first_name"), 38 | "name": _("choice", items=_NAMES), 39 | "age": _("age"), 40 | "m_time": _("formatted_datetime", fmt="%Y-%m-%d %H:%M:%S", start=_CURRENT_YEAR, end=_CURRENT_YEAR) 41 | }) 42 | 43 | if not options.dry_run: 44 | kinesis_streams_client = boto3.client('kinesis', region_name=options.region_name) 45 | 46 | cnt = 0 47 | for record in _schema.iterator(options.max_count): 48 | cnt += 1 49 | 50 | if options.dry_run: 51 | print(f"{json.dumps(record)}") 52 | else: 53 | res = kinesis_streams_client.put_record( 54 | StreamName=options.stream_name, 55 | Data=f"{json.dumps(record)}\n", # convert JSON to JSON Line 56 | PartitionKey=f"{record['name']}" 57 | ) 58 | 59 | if options.console: 60 | print(f"{json.dumps(record)}") 61 | 62 | if cnt % 100 == 0: 63 | print(f'[INFO] {cnt} records are 
processed', file=sys.stderr) 64 | 65 | if res['ResponseMetadata']['HTTPStatusCode'] != 200: 66 | print(res, file=sys.stderr) 67 | time.sleep(random.choice([0.01, 0.03, 0.05, 0.07, 0.1]))  # brief randomized pause between puts so records are not sent in a single burst 68 | print(f'[INFO] Total {cnt} records are processed', file=sys.stderr) 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | --------------------------------------------------------------------------------
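
The three Glue jobs above all resolve the Iceberg table through the same Glue catalog configuration, so the simplest way to confirm that streamed records are landing is to open a Spark session with that configuration and query the table. Below is a minimal verification sketch, not part of the repository: the catalog name, warehouse path, database, and table are placeholders for the values passed to the job as its `catalog`, `iceberg_s3_path`, `database_name`, and `table_name` arguments, and the session is assumed to have the Iceberg Spark runtime and AWS bundle jars available.

```python
# Verification sketch (assumptions: pyspark is installed and the Iceberg Spark runtime
# plus the Iceberg AWS/SDK bundle jars are on the classpath, e.g. via spark-submit --packages).
from pyspark.sql import SparkSession

# Placeholders: use the same values passed to the Glue job as its
# catalog / iceberg_s3_path / database_name / table_name arguments.
CATALOG = "glue_catalog"
ICEBERG_S3_PATH = "s3://your-bucket/your-warehouse-prefix"
DATABASE = "your_database_name"
TABLE_NAME = "your_table_name"

spark = (
    SparkSession.builder
    .config(f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog")
    .config(f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_PATH)
    .config(f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config(f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# The synthetic records carry an 'm_time' timestamp, so the most recently streamed rows
# should appear first once the Glue streaming job has committed a few micro-batches.
spark.sql(
    f"SELECT * FROM {CATALOG}.{DATABASE}.{TABLE_NAME} ORDER BY m_time DESC LIMIT 10"
).show(truncate=False)
```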