├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── web-analytics-datafirehose-iceberg ├── .example.cdk.context.json ├── .gitignore ├── README.md ├── app.py ├── assets │ ├── amazon-athena-switching-to-workgroup.png │ ├── wa-iceberg-data-level-01.png │ ├── wa-iceberg-data-level-02.png │ ├── wa-iceberg-data-level-03.png │ └── wa-iceberg-table.png ├── build-aws-lambda-layer-package.sh ├── cdk.json ├── cdk_stacks │ ├── __init__.py │ ├── apigw.py │ ├── firehose_data_proc_lambda.py │ ├── firehose_role.py │ ├── firehose_to_iceberg.py │ ├── lake_formation.py │ └── s3.py ├── requirements-dev.txt ├── requirements.txt ├── source.bat ├── src │ ├── main │ │ └── python │ │ │ └── IcebergTransformer │ │ │ └── firehose_to_iceberg_transformer.py │ └── utils │ │ └── gen_fake_data.py └── web-analytics-datafirehose-iceberg-arch.svg ├── web-analytics-iceberg ├── .example.cdk.context.json ├── .gitignore ├── README.md ├── app.py ├── assets │ ├── amazon-athena-switching-to-workgroup.png │ ├── wa-iceberg-data-level-01.png │ ├── wa-iceberg-data-level-02.png │ ├── wa-iceberg-data-level-03.png │ └── wa-iceberg-table.png ├── build-aws-lambda-layer-package.sh ├── cdk.json ├── cdk_stacks │ ├── __init__.py │ ├── apigw.py │ ├── firehose_data_proc_lambda.py │ ├── firehose_role.py │ ├── firehose_to_iceberg.py │ ├── kds.py │ ├── lake_formation.py │ └── s3.py ├── requirements-dev.txt ├── requirements.txt ├── source.bat ├── src │ ├── main │ │ └── python │ │ │ └── IcebergTransformer │ │ │ └── firehose_to_iceberg_transformer.py │ └── utils │ │ ├── gen_fake_data.py │ │ └── kds_consumer.py └── web-analytics-iceberg-arch.svg └── web-analytics-parquet ├── .example.cdk.context.json ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── app.py ├── assets ├── amazon-athena-switching-to-workgroup.png └── data-lake-formation-permissions.png ├── build-aws-lambda-layer-package.sh ├── cdk.json ├── cdk_stacks ├── __init__.py ├── apigw.py ├── athena_named_query.py ├── athena_workgroup.py ├── firehose.py ├── firehose_dtata_transform_lambda.py ├── glue_catalog_database.py ├── kds.py ├── lake_formation.py └── merge_small_files_lambda.py ├── requirements-dev.txt ├── requirements.txt ├── source.bat ├── src ├── main │ └── python │ │ ├── MergeSmallFiles │ │ └── athena_ctas.py │ │ └── SchemaValidator │ │ └── schema_validator.py └── utils │ ├── gen_fake_data.py │ └── kds_consumer.py └── web-analytics-arch.svg /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web Log Analytics with Amazon Kinesis Data Streams Proxy using Amazon API Gateway 3 | 4 | This repository provides you cdk scripts and sample code on how to implement a simple [web analytics](https://en.wikipedia.org/wiki/Web_analytics) system.
5 | Below diagram shows what we are implementing. 6 | 7 | | Example | Architecture Diagram | Tags | 8 | |---------|----------------------|------| 9 | | [Web Log Analytics System with Parquet data format](./web-analytics-parquet) | ![](./web-analytics-parquet/web-analytics-arch.svg) | Amazon API Gateway, Amazon Kinesis Data Streams, Amazon Data Firehose, Amazon S3 + Parquet, Amazon Athena, AWS Lambda, Amazon Event Bridge | 10 | | [Web Log Analytics System with Apache Iceberg table](./web-analytics-iceberg) | ![](./web-analytics-iceberg/web-analytics-iceberg-arch.svg) | Amazon API Gateway, Amazon Kinesis Data Streams, Amazon Data Firehose, Amazon S3 + Apache Iceberg, Amazon Athena, AWS Lambda | 11 | | [Web Log Analytics System using API Gateway integrated with Data Firehose with Apache Iceberg table](./web-analytics-datafirehose-iceberg) | ![](./web-analytics-datafirehose-iceberg/web-analytics-datafirehose-iceberg-arch.svg) | Amazon API Gateway, Amazon Data Firehose, Amazon S3 + Apache Iceberg, Amazon Athena, AWS Lambda | 12 | 13 | ## Security 14 | 15 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 16 | 17 | ## License 18 | 19 | This library is licensed under the MIT-0 License. See the LICENSE file. 20 | 21 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/.example.cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "acknowledged-issue-numbers": [ 3 | 32775 4 | ], 5 | "firehose_data_tranform_lambda": { 6 | "s3_bucket_name": "s3-bucket-name-for-lambda-layer-resources", 7 | "s3_object_key": "var/fastavro-lib-1.10.0-py-3.11.zip" 8 | }, 9 | "data_firehose_configuration": { 10 | "stream_name": "PUT-Firehose-aEhWz", 11 | "buffering_hints": { 12 | "interval_in_seconds": 60, 13 | "size_in_mbs": 128 14 | }, 15 | "transform_records_with_aws_lambda": { 16 | "buffer_size": 3, 17 | "buffer_interval": 300, 18 | "number_of_retries": 3 19 | }, 20 | "destination_iceberg_table_configuration": { 21 | "database_name": "web_log_iceberg_db", 22 | "table_name": "web_log_iceberg", 23 | "unique_keys": [ 24 | "user_id", "timestamp" 25 | ] 26 | }, 27 | "output_prefix": "web_log_iceberg_db/web_log_iceberg", 28 | "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web Log Analytics with Amazon Data Firehose Proxy using Amazon API Gateway 3 | 4 | This repository provides you cdk scripts and sample code on how to implement a simple [web analytics](https://en.wikipedia.org/wiki/Web_analytics) system. 5 | 6 | Below diagram shows what we are implementing. 7 | 8 | ![web-analytics-arch](web-analytics-datafirehose-iceberg-arch.svg) 9 | 10 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 
11 | 12 | This project is set up like a standard Python project. The initialization 13 | process also creates a virtualenv within this project, stored under the `.venv` 14 | directory. To create the virtualenv it assumes that there is a `python3` 15 | (or `python` for Windows) executable in your path with access to the `venv` 16 | package. If for any reason the automatic creation of the virtualenv fails, 17 | you can create the virtualenv manually. 18 | 19 | To manually create a virtualenv on MacOS and Linux: 20 | 21 | ``` 22 | $ python3 -m venv .venv 23 | ``` 24 | 25 | After the init process completes and the virtualenv is created, you can use the following 26 | step to activate your virtualenv. 27 | 28 | ``` 29 | $ source .venv/bin/activate 30 | ``` 31 | 32 | If you are on a Windows platform, activate the virtualenv like this: 33 | 34 | ``` 35 | % .venv\Scripts\activate.bat 36 | ``` 37 | 38 | Once the virtualenv is activated, you can install the required dependencies. 39 | 40 | ``` 41 | (.venv) $ pip install -r requirements.txt 42 | ``` 43 | 44 | To add additional dependencies, for example other CDK libraries, just add 45 | them to your `requirements.txt` file and rerun the `pip install -r requirements.txt` 46 | command. 47 | 48 | ### Upload Lambda Layer code 49 | 50 | Before deployment, upload the zipped Lambda layer code to S3 like this: 51 |
 52 | (.venv) $ aws s3api create-bucket --bucket your-s3-bucket-name-for-lambda-layer-code --region region-name
 53 | (.venv) $ ./build-aws-lambda-layer-package.sh your-s3-bucket-name-for-lambda-layer-code
 54 | 
55 | 56 | > :warning: To create a bucket outside of the `us-east-1` region, `aws s3api create-bucket` command requires the appropriate **LocationConstraint** to be specified in order to create the bucket in the desired region. For more information, see these [examples](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3api/create-bucket.html#examples). 57 | 58 | > :warning: Make sure you have **Docker** installed. 59 | 60 | For example, 61 |
 62 | (.venv) $ aws s3api create-bucket --bucket lambda-layer-resources --region us-east-1
 63 | (.venv) $ ./build-aws-lambda-layer-package.sh lambda-layer-resources
 64 | 
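
Optionally, you can confirm that the layer package landed where you expect before moving on. The following is a minimal sketch (not part of this repository) using boto3; the bucket name and object key are placeholders and should match the S3 path printed by `build-aws-lambda-layer-package.sh`:

```python
# check_layer_package.py - minimal sketch: verify the Lambda layer package exists in S3
# before referencing it in cdk.context.json. Bucket and key below are placeholders.
import boto3
from botocore.exceptions import ClientError

S3_BUCKET = "lambda-layer-resources"            # assumption: your layer bucket name
S3_KEY = "fastavro-lib-1.10.0-py-3.11.zip"      # assumption: key printed by the build script

s3 = boto3.client("s3")
try:
    head = s3.head_object(Bucket=S3_BUCKET, Key=S3_KEY)
    print(f"Found s3://{S3_BUCKET}/{S3_KEY} ({head['ContentLength']} bytes)")
except ClientError as err:
    print(f"Layer package not found: {err}")
```
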
65 | 66 | For more information about how to create a package for an AWS Lambda layer, see [here](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/). 67 | 68 | ### Deploy 69 | 70 | Before synthesizing the CloudFormation template for this code, you should update the `cdk.context.json` file.
71 | In particular, you need to fill in the S3 location of the Lambda layer package you uploaded earlier. 72 | 73 | For example, 74 |
 75 | {
 76 |   "firehose_data_tranform_lambda": {
 77 |     "s3_bucket_name": "lambda-layer-resources",
 78 |     "s3_object_key": "var/fastavro-lib.zip"
 79 |   },
 80 |   "data_firehose_configuration": {
 81 |     "stream_name": "PUT-Firehose-aEhWz",
 82 |     "buffering_hints": {
 83 |       "interval_in_seconds": 60,
 84 |       "size_in_mbs": 128
 85 |     },
 86 |     "transform_records_with_aws_lambda": {
 87 |       "buffer_size": 3,
 88 |       "buffer_interval": 300,
 89 |       "number_of_retries": 3
 90 |     },
 91 |     "destination_iceberg_table_configuration": {
 92 |       "database_name": "web_log_iceberg_db",
 93 |       "table_name": "web_log_iceberg"
 94 |     },
 95 |     "output_prefix": "web_log_iceberg_db/web_log_iceberg",
 96 |     "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}"
 97 |   }
 98 | }
 99 | 
100 | :information_source: The `database_name` and `table_name` values in `data_firehose_configuration.destination_iceberg_table_configuration` are used in the [**Set up Delivery Stream**](#set-up-delivery-stream) step. 101 | 102 | :information_source: When updating or deleting records in an Iceberg table, specify the table's primary key column names as `unique_keys` in the `data_firehose_configuration.destination_iceberg_table_configuration` settings. 103 | For example, 104 |
105 | "destination_iceberg_table_configuration": {
106 |   "database_name": "web_log_iceberg_db",
107 |   "table_name": "web_log_iceberg",
108 |   "unique_keys": [
109 |     "user_id", "timestamp"
110 |   ]
111 | }
112 | 
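
These context values are not only wired into the Data Firehose destination configuration; the CDK stack in `cdk_stacks/firehose_data_proc_lambda.py` also passes them to the transformation Lambda as the `IcebergDatabaseName`, `IcebergTableName`, and `IcebergTableUniqueKeys` environment variables (the unique keys are joined with commas). A minimal sketch of how a function reads them back:

```python
# Sketch only: how the transformer Lambda sees the cdk.context.json values above.
# Environment variable names come from cdk_stacks/firehose_data_proc_lambda.py.
import os

DATABASE_NAME = os.environ["IcebergDatabaseName"]    # e.g. "web_log_iceberg_db"
TABLE_NAME = os.environ["IcebergTableName"]          # e.g. "web_log_iceberg"
# Empty string when no unique_keys were configured; comma-separated otherwise.
UNIQUE_KEYS = [k for k in os.environ.get("IcebergTableUniqueKeys", "").split(",") if k]

print(DATABASE_NAME, TABLE_NAME, UNIQUE_KEYS)        # e.g. ['user_id', 'timestamp']
```
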
113 | 114 | 115 | Now you are ready to synthesize the CloudFormation template for this code.
116 | 117 |
118 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
119 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
120 | (.venv) $ cdk synth --all
121 | 
122 | 123 | Now let's try to deploy. 124 | 125 | ## List all CDK Stacks 126 | 127 | ``` 128 | (.venv) $ cdk list 129 | WebAnalyticsDataFirehoseProxyApiGw 130 | WebAnalyticsDataFirehoseToIcebergS3Path 131 | WebAnalyticsFirehoseDataTransformLambdaStack 132 | WebAnalyticsFirehoseToIcebergRoleStack 133 | WebAnalyticsGrantLFPermissionsOnFirehoseRole 134 | WebAnalyticsFirehoseToIcebergStack 135 | ``` 136 | 137 | Use `cdk deploy` command to create the stack shown above. 138 | 139 | ## Create API endpoint for web data collection 140 | 141 |
142 | (.venv) $ cdk deploy --require-approval never WebAnalyticsDataFirehoseProxyApiGw
143 | 
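
The stack exports the invoke URL as the `RestApiEndpointUrl` output, so you can read it from the CloudFormation console or programmatically. A minimal boto3 sketch, assuming the stack name used above:

```python
# Sketch: look up the API Gateway invoke URL exported by the WebAnalyticsDataFirehoseProxyApiGw stack.
import boto3

cfn = boto3.client("cloudformation")
stack = cfn.describe_stacks(StackName="WebAnalyticsDataFirehoseProxyApiGw")["Stacks"][0]
outputs = {o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])}
print(outputs["RestApiEndpointUrl"])  # e.g. https://<api-id>.execute-api.<region>.amazonaws.com/v1/
```
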
144 | 145 | ## Set up Delivery Stream 146 | 147 | 1. Create an S3 bucket for the Apache Iceberg table 148 |
149 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsDataFirehoseToIcebergS3Path
150 |    
151 | 2. Create a table with partitioned data in Amazon Athena 152 | 153 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console.
154 | * (step 1) Create a database 155 | 156 | In order to create a new database called `web_log_iceberg_db`, enter the following statement in the Athena query editor and click the **Run** button to execute the query. 157 | 158 |
159 |       CREATE DATABASE IF NOT EXISTS web_log_iceberg_db;
160 |       
161 | 162 | * (step 2) Create a table 163 | 164 | Copy the following query into the Athena query editor. 165 | 166 | Update `LOCATION` to your S3 bucket name and execute the query to create a new table. 167 |
168 |       CREATE TABLE web_log_iceberg_db.web_log_iceberg (
169 |         `user_id` string,
170 |         `session_id` string,
171 |         `event` string,
172 |         `referrer` string,
173 |         `user_agent` string,
174 |         `ip` string,
175 |         `hostname` string,
176 |         `os` string,
177 |         `timestamp` timestamp,
178 |         `uri` string
179 |       )
180 |       PARTITIONED BY (event)
181 |       LOCATION 's3://web-analytics-{region}-{account_id}/web_log_iceberg_db/web_log_iceberg'
182 |       TBLPROPERTIES (
183 |         'table_type'='iceberg',
184 |         'format'='parquet',
185 |         'write_compression'='snappy',
186 |         'optimize_rewrite_delete_file_threshold'='10'
187 |       );
188 |       
189 | If the query is successful, a table named `web_log_iceberg` is created and displayed on the left panel under the **Tables** section. 190 | 191 | If you get an error, check that (a) you have updated the `LOCATION` to the correct S3 bucket name, (b) you have `web_log_iceberg_db` selected under the **Database** dropdown, and (c) you have `AwsDataCatalog` selected as the **Data source**. 192 | 3. Create a Lambda function to process the streaming data (the record format this function must return is sketched after this list). 193 |
194 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseDataTransformLambdaStack
195 |    
196 | 4. To allow Data Firehose to ingest data into the Apache Iceberg table, create an IAM role and grant permissions to the role. 197 |
198 |    (.venv) $ cdk deploy --require-approval never \
199 |                  WebAnalyticsFirehoseToIcebergRoleStack \
200 |                  WebAnalyticsGrantLFPermissionsOnFirehoseRole
201 |    
202 | 203 | :information_source: If you fail to create the table, give Athena users access permissions on `web_log_iceberg_db` through [AWS Lake Formation](https://console.aws.amazon.com/lakeformation/home), or grant Amazon Data Firehose access to `web_log_iceberg_db` by running the following commands: 204 |
205 |    (.venv) $ aws lakeformation grant-permissions \
206 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
207 |                  --permissions CREATE_TABLE DESCRIBE ALTER DROP \
208 |                  --resource '{ "Database": { "Name": "web_log_iceberg_db" } }'
209 |    (.venv) $ aws lakeformation grant-permissions \
210 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
211 |                  --permissions SELECT DESCRIBE ALTER INSERT DELETE DROP \
212 |                  --resource '{ "Table": {"DatabaseName": "web_log_iceberg_db", "TableWildcard": {}} }'
213 |    
214 | 5. Deploy Amazon Data Firehose. 215 |
216 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseToIcebergStack
217 |    
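
For reference, the transformation Lambda deployed in step 3 follows the standard Amazon Data Firehose record-transformation contract: it receives base64-encoded records and must return each record with the same `recordId`, a `result` of `Ok`, `Dropped`, or `ProcessingFailed`, and base64-encoded `data`. The sketch below shows only that contract; the actual function in `src/main/python/IcebergTransformer/firehose_to_iceberg_transformer.py` additionally validates every record against an Avro schema before letting it through.

```python
# Minimal sketch of the Firehose data-transformation contract used by the Lambda in step 3.
# The production function in src/main/python/IcebergTransformer/ adds Avro schema validation.
import base64
import json


def lambda_handler(event, context):
    output = []
    for record in event["records"]:
        payload = json.loads(base64.b64decode(record["data"]))
        is_valid = isinstance(payload, dict) and "user_id" in payload  # placeholder check
        output.append({
            "recordId": record["recordId"],
            "result": "Ok" if is_valid else "ProcessingFailed",
            "data": base64.b64encode(json.dumps(payload).encode("utf-8")).decode("utf-8"),
        })
    return {"records": output}
```
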
218 | 219 | ## Run Test 220 | 221 | 1. Run the `GET /streams` method to invoke the Amazon Data Firehose `ListDeliveryStreams` API 222 |
223 |    $ curl -X GET https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1/streams
224 |    
225 | 226 | The response is: 227 |
228 |    {
229 |      "DeliveryStreamNames": [
230 |        "PUT-Firehose-aEhWz"
231 |      ],
232 |      "HasMoreDeliveryStreams": false
233 |    }
234 |    
235 | 236 | 2. Generate test data. 237 |
238 |    (.venv) $ pip install -r requirements-dev.txt
239 |    (.venv) $ python src/utils/gen_fake_data.py \
240 |                  --stream-name PUT-Firehose-aEhWz \
241 |                  --api-url 'https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1' \
242 |                  --api-method records \
243 |                  --max-count 5
244 | 
245 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"NxB5xOO4Y30ppGBZFfpDoREir/dcWwsF1j4NAie9K1N5pqpjZCSkJPM+7I+Wx7gB/H6hS1BUFGLVIQlR/xEsi7WzT6uA/JX4nXndcF7gxhn3UFGEyyFcgDXyjot5lCFJ5UNnhJk8gAeYT0Ghxj3BNTI22hgrfqdDnjo5MoAg8/0us408pDL37EF4DpIkFMAXWdZdwLRcS6cDt0o0XADBV17XwJnilrSv"}]}
246 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"slrDNLj+LPl1BAi6LzUVvUrhICOdnBY48gIG09zDGb/8fJElu3pYyTdfdNk9V+06rHz/ZY9RoV/0+UapEHaDDVqSjeDQZyZx0HeB2UDVP167Iv1DMgDvDIAiVlwcAyEsfUloqtRekM/B4NHEteJvCrPpqeQV8kYqk6EE1yJvJiLhBnyTVEuoVWbW4qiD+djsgijfL4EufK4ahdQN+CYs70HdUTEdQiV0"}]}
247 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"WGUixKjKAE3aXVe3FbhoRGVh1WomWht8/S1lqhUa6IhxN+tskX5xxO3PjsukPSDDMd9J5LwfzwSh7tt9PQMaqh2r6JDTvP3X3wFItGGrhqY6UD52zs/Z9WINpa1HWcl677xk/qec61gvD5QOpTXWmfG2Q/uWwuboIHoKIqigxeqMpsRpPH40TA6m0HF9AJVrZ5a2VI+OhEK9V/5VkaTI5aQ+Gltl/TSj"}]}
248 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"sOiJXXhDffAuoLaOm7E3y/8GIb9bwVbqrUcfotKT4H2iVQs3sPO1BxVwuaCMpfL8sQwpL4TSg5Y3EfLOzjrGlEOa4D14a3GAuffMQSEBVlwuJDED4JcFHJ/ltekVK/pMyejbBjyVk4e+S1oFK1LaXiGrcrVJ6XzJBk/NDnRLxGLYy+takFZMfyaStcZxXonnmdqw8YwWGgGnsbwj2nGVkR9PBWdyh41l"}]}
249 |    
250 |    251 | 3. Check streaming data in S3 252 | 253 | After `5~10` minutes, you can see that the streaming data has been delivered by **Amazon Data Firehose** to **S3**. 254 | 255 | ![iceberg-table](./assets/wa-iceberg-table.png) 256 | ![iceberg-table-data-level-01](./assets/wa-iceberg-data-level-01.png) 257 | ![iceberg-table-data-level-02](./assets/wa-iceberg-data-level-02.png) 258 | ![iceberg-table-data-level-03](./assets/wa-iceberg-data-level-03.png) 259 | 260 | 4. Run test query using Amazon Athena 261 | 262 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console. 263 | 264 | * (Step 1) Specify the workgroup to use 265 | 266 | To run queries, switch to the appropriate workgroup like this: 267 | ![amazon-athena-switching-to-workgroup](./assets/amazon-athena-switching-to-workgroup.png) 268 | 269 | * (Step 2) Run test query 270 | 271 | Enter the following SQL statement and execute the query. 272 |
273 |      SELECT COUNT(*)
274 |      FROM web_log_iceberg_db.web_log_iceberg;
275 |      
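
The same check can be scripted against the Athena API instead of the console. A minimal boto3 sketch; the workgroup name is a placeholder for the workgroup you selected in Step 1, and it assumes that workgroup already has a query result location configured:

```python
# Sketch: run the row-count query via the Athena API instead of the console.
import time

import boto3

athena = boto3.client("athena")
qid = athena.start_query_execution(
    QueryString="SELECT COUNT(*) FROM web_log_iceberg_db.web_log_iceberg",
    QueryExecutionContext={"Database": "web_log_iceberg_db"},
    WorkGroup="your-athena-workgroup",  # placeholder
)["QueryExecutionId"]

while True:
    state = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]["Status"]["State"]
    if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(2)

if state == "SUCCEEDED":
    rows = athena.get_query_results(QueryExecutionId=qid)["ResultSet"]["Rows"]
    print(rows[1]["Data"][0]["VarCharValue"])  # rows[0] is the header row
else:
    print(f"Query ended in state {state}")
```
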
276 | 277 | ## Clean Up 278 | 279 | Delete the CloudFormation stack by running the below command. 280 |
281 | (.venv) $ cdk destroy --force --all
282 | 
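
Note that the destination bucket is created with `RemovalPolicy.DESTROY` but without `auto_delete_objects` (see `cdk_stacks/s3.py`), so stack deletion can fail while the bucket still holds data. Emptying it first avoids that; a minimal sketch, assuming the default `web-analytics-{region}-{account}` bucket name:

```python
# Sketch: empty the destination bucket before `cdk destroy` so bucket deletion can succeed.
import boto3

region = boto3.session.Session().region_name
account_id = boto3.client("sts").get_caller_identity()["Account"]
bucket_name = f"web-analytics-{region}-{account_id}"  # default name from cdk_stacks/s3.py

boto3.resource("s3").Bucket(bucket_name).objects.all().delete()
print(f"Emptied s3://{bucket_name}")
```
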
283 | 284 | 285 | ## Useful commands 286 | 287 | * `cdk ls` list all stacks in the app 288 | * `cdk synth` emits the synthesized CloudFormation template 289 | * `cdk deploy` deploy this stack to your default AWS account/region 290 | * `cdk diff` compare deployed stack with current state 291 | * `cdk docs` open CDK documentation 292 | 293 | Enjoy! 294 | 295 | ## References 296 | 297 | * [Web Analytics](https://en.wikipedia.org/wiki/Web_analytics) 298 | * [Tutorial: Create a REST API as an Amazon Kinesis proxy in API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html) 299 | * [Streaming Data Solution for Amazon Kinesis](https://aws.amazon.com/ko/solutions/implementations/aws-streaming-data-solution-for-amazon-kinesis/) 300 |
301 | 302 |
303 | * [(AWS Developer Guide) Deliver data to Apache Iceberg Tables with Amazon Data Firehose](https://docs.aws.amazon.com/firehose/latest/dev/apache-iceberg-destination.html) 304 | * [Building fine-grained authorization using Amazon Cognito, API Gateway, and IAM](https://aws.amazon.com/ko/blogs/security/building-fine-grained-authorization-using-amazon-cognito-api-gateway-and-iam/) 305 | * [AWS Lake Formation - Create a data lake administrator](https://docs.aws.amazon.com/lake-formation/latest/dg/getting-started-setup.html#create-data-lake-admin) 306 | * [AWS Lake Formation Permissions Reference](https://docs.aws.amazon.com/lake-formation/latest/dg/lf-permissions-reference.html) 307 | * [Amazon Athena Using Iceberg tables](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html) 308 | * [Amazon Athena Workshop](https://athena-in-action.workshop.aws/) 309 | * [Curl Cookbook](https://catonmat.net/cookbooks/curl) 310 | * [fastavro](https://fastavro.readthedocs.io/) - Fast read/write of `AVRO` files 311 | * [Apache Avro Specification](https://avro.apache.org/docs/current/spec.html) 312 | * [How to create a Lambda layer using a simulated Lambda environment with Docker](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/) 313 | ``` 314 | $ cat < requirements-Lambda-Layer.txt 315 | > fastavro==1.6.1 316 | > EOF 317 | $ docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install -r requirements-Lambda-Layer.txt -t python/lib/python3.11/site-packages/; exit" 318 | $ zip -r fastavro-lib.zip python > /dev/null 319 | $ aws s3 mb s3://my-bucket-for-lambda-layer-packages 320 | $ aws s3 cp fastavro-lib.zip s3://my-bucket-for-lambda-layer-packages/ 321 | ``` 322 | 323 | ## Security 324 | 325 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 326 | 327 | ## License 328 | 329 | This library is licensed under the MIT-0 License. See the LICENSE file. 
330 | 331 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from cdk_stacks import ( 10 | DataFirehoseProxyStack, 11 | FirehoseToIcebergStack, 12 | FirehoseRoleStack, 13 | FirehoseDataProcLambdaStack, 14 | DataLakePermissionsStack, 15 | S3BucketStack, 16 | ) 17 | 18 | AWS_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 19 | region=os.getenv('CDK_DEFAULT_REGION')) 20 | 21 | app = cdk.App() 22 | 23 | kds_proxy_apigw = DataFirehoseProxyStack(app, 'WebAnalyticsDataFirehoseProxyApiGw') 24 | 25 | s3_dest_bucket = S3BucketStack(app, 'WebAnalyticsDataFirehoseToIcebergS3Path', 26 | env=AWS_ENV) 27 | 28 | firehose_data_transform_lambda = FirehoseDataProcLambdaStack(app, 29 | 'WebAnalyticsFirehoseDataTransformLambdaStack', 30 | env=AWS_ENV 31 | ) 32 | firehose_data_transform_lambda.add_dependency(s3_dest_bucket) 33 | 34 | firehose_role = FirehoseRoleStack(app, 'WebAnalyticsFirehoseToIcebergRoleStack', 35 | firehose_data_transform_lambda.data_proc_lambda_fn, 36 | s3_dest_bucket.s3_bucket, 37 | env=AWS_ENV 38 | ) 39 | firehose_role.add_dependency(firehose_data_transform_lambda) 40 | 41 | grant_lake_formation_permissions = DataLakePermissionsStack(app, 'WebAnalyticsGrantLFPermissionsOnFirehoseRole', 42 | firehose_role.firehose_role, 43 | env=AWS_ENV 44 | ) 45 | grant_lake_formation_permissions.add_dependency(firehose_role) 46 | 47 | firehose_stack = FirehoseToIcebergStack(app, 'WebAnalyticsFirehoseToIcebergStack', 48 | firehose_data_transform_lambda.data_proc_lambda_fn, 49 | s3_dest_bucket.s3_bucket, 50 | firehose_role.firehose_role, 51 | env=AWS_ENV 52 | ) 53 | firehose_stack.add_dependency(grant_lake_formation_permissions) 54 | 55 | app.synth() 56 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/amazon-athena-switching-to-workgroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/amazon-athena-switching-to-workgroup.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-01.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-02.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-03.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-03.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-table.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/build-aws-lambda-layer-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash - 2 | 3 | VERSION=1.10.0 4 | PY_VERSION=3.11 5 | LAMBDA_LAYER_NAME=fastavro-lib-${VERSION}-py-${PY_VERSION} 6 | S3_PATH=$1 7 | 8 | docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install fastavro==${VERSION} -t python/lib/python3.11/site-packages/; exit" 9 | 10 | zip -q -r ${LAMBDA_LAYER_NAME}.zip python >/dev/null 11 | aws s3 cp --quiet ${LAMBDA_LAYER_NAME}.zip s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip 12 | echo "[Lambda_Layer_Code_S3_Path] s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip" 13 | 14 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .apigw import DataFirehoseProxyStack 2 | from .firehose_to_iceberg import FirehoseToIcebergStack 3 | from .firehose_role import FirehoseRoleStack 4 | from .firehose_data_proc_lambda import FirehoseDataProcLambdaStack 5 | from .lake_formation import DataLakePermissionsStack 6 | from .s3 import S3BucketStack 
-------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/apigw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import json 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_apigateway, 12 | aws_iam 13 | ) 14 | from constructs import Construct 15 | 16 | 17 | class DataFirehoseProxyStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | apigw_kds_access_role_policy_doc = aws_iam.PolicyDocument() 23 | apigw_kds_access_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": ["*"], 26 | "actions": [ 27 | "firehose:DescribeDeliveryStream", 28 | "firehose:PutRecord", 29 | "firehose:PutRecordBatch" 30 | ] 31 | })) 32 | 33 | apigw_datafirehose_role = aws_iam.Role(self, "APIGatewayRoleToAccessDataFirehose", 34 | role_name=f"APIGatewayRoleToAccessDataFirehose", 35 | assumed_by=aws_iam.ServicePrincipal('apigateway.amazonaws.com'), 36 | inline_policies={ 37 | 'DataFirehoseWriteAccess': apigw_kds_access_role_policy_doc 38 | }, 39 | managed_policies=[ 40 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisFirehoseReadOnlyAccess') 41 | ] 42 | ) 43 | 44 | #XXX: Start to create an API as a Kinesis proxy 45 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-api-as-kinesis-proxy 46 | datafirehose_proxy_api = aws_apigateway.RestApi(self, "DataFirehoseProxyAPI", 47 | rest_api_name="log-collector", 48 | description="An Amazon API Gateway REST API that integrated with an Amazon Data Firehose.", 49 | endpoint_types=[aws_apigateway.EndpointType.REGIONAL], 50 | default_cors_preflight_options={ 51 | "allow_origins": aws_apigateway.Cors.ALL_ORIGINS 52 | }, 53 | deploy=True, 54 | deploy_options=aws_apigateway.StageOptions(stage_name="v1"), 55 | endpoint_export_name="DataFirehoseProxyAPIEndpoint" 56 | ) 57 | 58 | apigw_error_responses = [ 59 | aws_apigateway.IntegrationResponse(status_code="400", selection_pattern="4\\d{2}"), 60 | aws_apigateway.IntegrationResponse(status_code="500", selection_pattern="5\\d{2}") 61 | ] 62 | 63 | #XXX: GET /streams 64 | # List Kinesis streams by using the API Gateway console 65 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-list-kinesis-streams 66 | 67 | streams_resource = datafirehose_proxy_api.root.add_resource("streams") 68 | 69 | list_streams_options = aws_apigateway.IntegrationOptions( 70 | credentials_role=apigw_datafirehose_role, 71 | integration_responses=[ 72 | aws_apigateway.IntegrationResponse( 73 | status_code="200" 74 | ), 75 | *apigw_error_responses 76 | ], 77 | request_templates={ 78 | 'application/json': '{}' 79 | }, 80 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 81 | ) 82 | 83 | list_streams_integration = aws_apigateway.AwsIntegration( 84 | service='firehose', 85 | action='ListDeliveryStreams', 86 | integration_http_method='POST', 87 | options=list_streams_options 88 | ) 89 | 90 | streams_resource.add_method("GET", list_streams_integration, 91 | # Default `authorization_type`: - open access unless `authorizer` is specified 92 | 
authorization_type=aws_apigateway.AuthorizationType.NONE, 93 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 94 | response_models={ 95 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 96 | } 97 | ), 98 | aws_apigateway.MethodResponse(status_code='400'), 99 | aws_apigateway.MethodResponse(status_code='500') 100 | ]) 101 | 102 | #XXX: GET /streams/{stream-name} 103 | # Describe a stream in Kinesis 104 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-describe-delete-stream 105 | one_stream_resource = streams_resource.add_resource("{stream-name}") 106 | 107 | describe_stream_options = aws_apigateway.IntegrationOptions( 108 | credentials_role=apigw_datafirehose_role, 109 | integration_responses=[ 110 | aws_apigateway.IntegrationResponse( 111 | status_code="200" 112 | ), 113 | *apigw_error_responses 114 | ], 115 | request_templates={ 116 | 'application/json': json.dumps({ 117 | "DeliveryStreamName": "$input.params('stream-name')" 118 | }, indent=2) 119 | }, 120 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 121 | ) 122 | 123 | describe_stream_integration = aws_apigateway.AwsIntegration( 124 | service='firehose', 125 | action='DescribeDeliveryStream', 126 | integration_http_method='POST', 127 | options=describe_stream_options 128 | ) 129 | 130 | one_stream_resource.add_method("GET", describe_stream_integration, 131 | # Default `authorization_type`: - open access unless `authorizer` is specified 132 | authorization_type=aws_apigateway.AuthorizationType.NONE, 133 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 134 | response_models={ 135 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 136 | } 137 | ), 138 | aws_apigateway.MethodResponse(status_code='400'), 139 | aws_apigateway.MethodResponse(status_code='500') 140 | ]) 141 | 142 | #XXX: PUT /streams/{stream-name}/record 143 | # Put a record into a stream in Kinesis 144 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 145 | record_resource = one_stream_resource.add_resource("record") 146 | 147 | put_record_request_mapping_templates = ''' 148 | { 149 | "DeliveryStreamName": "$input.params('stream-name')", 150 | "Record": { 151 | "Data": "$util.base64Encode($input.json('$.Data'))" 152 | } 153 | } 154 | ''' 155 | 156 | put_record_options = aws_apigateway.IntegrationOptions( 157 | credentials_role=apigw_datafirehose_role, 158 | integration_responses=[ 159 | aws_apigateway.IntegrationResponse( 160 | status_code="200" 161 | ), 162 | *apigw_error_responses 163 | ], 164 | request_templates={ 165 | 'application/json': put_record_request_mapping_templates 166 | }, 167 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 168 | ) 169 | 170 | put_record_integration = aws_apigateway.AwsIntegration( 171 | service='firehose', 172 | action='PutRecord', 173 | integration_http_method='POST', 174 | options=put_record_options 175 | ) 176 | 177 | record_resource.add_method("PUT", put_record_integration, 178 | # Default `authorization_type`: - open access unless `authorizer` is specified 179 | authorization_type=aws_apigateway.AuthorizationType.NONE, 180 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 181 | response_models={ 182 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 183 | } 184 | ), 185 | aws_apigateway.MethodResponse(status_code='400'), 186 | 
aws_apigateway.MethodResponse(status_code='500') 187 | ]) 188 | 189 | 190 | #XXX: PUT /streams/{stream-name}/records 191 | # Put records into a stream in Kinesis 192 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 193 | records_resource = one_stream_resource.add_resource("records") 194 | 195 | put_records_request_mapping_templates = ''' 196 | { 197 | "DeliveryStreamName": "$input.params('stream-name')", 198 | "Records": [ 199 | #foreach($elem in $input.path('$.records')) 200 | { 201 | "Data": "$util.base64Encode($elem.data)" 202 | }#if($foreach.hasNext),#end 203 | #end 204 | ] 205 | } 206 | ''' 207 | 208 | put_records_options = aws_apigateway.IntegrationOptions( 209 | credentials_role=apigw_datafirehose_role, 210 | integration_responses=[ 211 | aws_apigateway.IntegrationResponse( 212 | status_code="200" 213 | ), 214 | *apigw_error_responses 215 | ], 216 | request_templates={ 217 | 'application/json': put_records_request_mapping_templates 218 | }, 219 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 220 | ) 221 | 222 | put_records_integration = aws_apigateway.AwsIntegration( 223 | service='firehose', 224 | action='PutRecordBatch', 225 | integration_http_method='POST', 226 | options=put_records_options 227 | ) 228 | 229 | records_resource.add_method("PUT", put_records_integration, 230 | # Default `authorization_type`: - open access unless `authorizer` is specified 231 | authorization_type=aws_apigateway.AuthorizationType.NONE, 232 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 233 | response_models={ 234 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 235 | } 236 | ), 237 | aws_apigateway.MethodResponse(status_code='400'), 238 | aws_apigateway.MethodResponse(status_code='500') 239 | ]) 240 | 241 | 242 | cdk.CfnOutput(self, 'RestApiEndpointUrl', 243 | value=datafirehose_proxy_api.url, 244 | export_name=f'{self.stack_name}-RestApiEndpointUrl') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/firehose_data_proc_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_lambda, 12 | aws_logs, 13 | aws_s3 as s3 14 | ) 15 | from constructs import Construct 16 | 17 | 18 | class FirehoseDataProcLambdaStack(Stack): 19 | 20 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 21 | super().__init__(scope, construct_id, **kwargs) 22 | 23 | firehose_data_transform_lambda_config = self.node.try_get_context('firehose_data_tranform_lambda') 24 | LAMBDA_LAYER_CODE_S3_BUCKET = firehose_data_transform_lambda_config['s3_bucket_name'] 25 | LAMBDA_LAYER_CODE_S3_OBJ_KEY = firehose_data_transform_lambda_config['s3_object_key'] 26 | 27 | s3_lambda_layer_lib_bucket = s3.Bucket.from_bucket_name(self, "LambdaLayerS3Bucket", LAMBDA_LAYER_CODE_S3_BUCKET) 28 | lambda_lib_layer = aws_lambda.LayerVersion(self, "SchemaValidatorLib", 29 | layer_version_name="fastavro-lib", 30 | compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11], 31 | code=aws_lambda.Code.from_bucket(s3_lambda_layer_lib_bucket, LAMBDA_LAYER_CODE_S3_OBJ_KEY) 32 | ) 33 | 34 | data_firehose_configuration = 
self.node.try_get_context("data_firehose_configuration") 35 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 36 | dest_iceberg_table_unique_keys = dest_iceberg_table_config.get("unique_keys", None) 37 | dest_iceberg_table_unique_keys = ",".join(dest_iceberg_table_unique_keys) if dest_iceberg_table_unique_keys else "" 38 | 39 | LAMBDA_FN_NAME = "WebAnalyticsFirehoseToIcebergTransformer" 40 | self.data_proc_lambda_fn = aws_lambda.Function(self, "FirehoseToIcebergTransformer", 41 | runtime=aws_lambda.Runtime.PYTHON_3_11, 42 | function_name=LAMBDA_FN_NAME, 43 | handler="firehose_to_iceberg_transformer.lambda_handler", 44 | description="Transform records to Apache Iceberg table", 45 | code=aws_lambda.Code.from_asset(os.path.join(os.path.dirname(__file__), '../src/main/python/IcebergTransformer')), 46 | environment={ 47 | "IcebergDatabaseName": dest_iceberg_table_config["database_name"], 48 | "IcebergTableName": dest_iceberg_table_config["table_name"], 49 | "IcebergTableUniqueKeys": dest_iceberg_table_unique_keys 50 | }, 51 | timeout=cdk.Duration.minutes(5), 52 | #XXX: set memory size appropriately 53 | memory_size=256, 54 | layers=[lambda_lib_layer] 55 | ) 56 | 57 | log_group = aws_logs.LogGroup(self, "FirehoseToIcebergTransformerLogGroup", 58 | #XXX: Circular dependency between resources occurs 59 | # if aws_lambda.Function.function_name is used 60 | # instead of literal name of lambda function such as "FirehoseToIcebergTransformer" 61 | log_group_name=f"/aws/lambda/{LAMBDA_FN_NAME}", 62 | retention=aws_logs.RetentionDays.THREE_DAYS, 63 | removal_policy=cdk.RemovalPolicy.DESTROY 64 | ) 65 | log_group.grant_write(self.data_proc_lambda_fn) 66 | 67 | 68 | cdk.CfnOutput(self, 'FirehoseDataProcFuncName', 69 | value=self.data_proc_lambda_fn.function_name, 70 | export_name=f'{self.stack_name}-FirehoseDataProcFuncName') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/firehose_role.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_iam 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class FirehoseRoleStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, 17 | data_transform_lambda_fn, s3_bucket, **kwargs) -> None: 18 | 19 | super().__init__(scope, construct_id, **kwargs) 20 | 21 | firehose_role_policy_doc = aws_iam.PolicyDocument() 22 | 23 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": [s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)], 26 | "actions": [ 27 | "s3:AbortMultipartUpload", 28 | "s3:GetBucketLocation", 29 | "s3:GetObject", 30 | "s3:ListBucket", 31 | "s3:ListBucketMultipartUploads", 32 | "s3:PutObject", 33 | "s3:DeleteObject" 34 | ] 35 | })) 36 | 37 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 38 | "effect": aws_iam.Effect.ALLOW, 39 | "resources": [ 40 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog", 41 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:database/*", 42 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:table/*/*" 43 | ], 44 | "actions": [ 45 | "glue:GetTable", 46 | "glue:GetDatabase", 47 | "glue:UpdateTable" 48 | ] 49 | })) 50 | 51 | 
firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 52 | effect=aws_iam.Effect.ALLOW, 53 | resources=["*"], 54 | actions=[ 55 | "ec2:DescribeVpcs", 56 | "ec2:DescribeVpcAttribute", 57 | "ec2:DescribeSubnets", 58 | "ec2:DescribeSecurityGroups", 59 | "ec2:DescribeNetworkInterfaces", 60 | "ec2:CreateNetworkInterface", 61 | "ec2:CreateNetworkInterfacePermission", 62 | "ec2:DeleteNetworkInterface" 63 | ] 64 | )) 65 | 66 | #XXX: https://docs.aws.amazon.com/ko_kr/cdk/latest/guide/tokens.html 67 | # String-encoded tokens: 68 | # Avoid manipulating the string in other ways. For example, 69 | # taking a substring of a string is likely to break the string token. 70 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 71 | firehose_stream_name = data_firehose_configuration['stream_name'] 72 | 73 | firehose_log_group_name = f"/aws/kinesisfirehose/{firehose_stream_name}" 74 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 75 | effect=aws_iam.Effect.ALLOW, 76 | #XXX: The ARN will be formatted as follows: 77 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 78 | resources=[self.format_arn(service="logs", resource="log-group", 79 | resource_name="{}:log-stream:*".format(firehose_log_group_name), 80 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 81 | actions=["logs:PutLogEvents"] 82 | )) 83 | 84 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 85 | "effect": aws_iam.Effect.ALLOW, 86 | "resources": [f"{data_transform_lambda_fn.function_arn}:*"], 87 | "actions": [ 88 | "lambda:InvokeFunction", 89 | "lambda:GetFunctionConfiguration" 90 | ] 91 | })) 92 | 93 | self.firehose_role = aws_iam.Role(self, "KinesisFirehoseServiceRole", 94 | role_name=f"KinesisFirehoseServiceRole-{firehose_stream_name}-{self.region}", 95 | assumed_by=aws_iam.ServicePrincipal("firehose.amazonaws.com"), 96 | #XXX: use inline_policies to work around https://github.com/aws/aws-cdk/issues/5221 97 | inline_policies={ 98 | "firehose_role_policy": firehose_role_policy_doc 99 | } 100 | ) 101 | 102 | 103 | cdk.CfnOutput(self, 'FirehoseRole', 104 | value=self.firehose_role.role_name, 105 | export_name=f'{self.stack_name}-Role') 106 | cdk.CfnOutput(self, 'FirehoseRoleArn', 107 | value=self.firehose_role.role_arn, 108 | export_name=f'{self.stack_name}-RoleArn') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/firehose_to_iceberg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_kinesisfirehose 10 | ) 11 | from constructs import Construct 12 | 13 | from aws_cdk.aws_kinesisfirehose import CfnDeliveryStream as cfn_delivery_stream 14 | 15 | 16 | class FirehoseToIcebergStack(Stack): 17 | 18 | def __init__(self, scope: Construct, construct_id: str, 19 | data_transform_lambda_fn, s3_bucket, 20 | firehose_role, **kwargs) -> None: 21 | 22 | super().__init__(scope, construct_id, **kwargs) 23 | 24 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 25 | 26 | delivery_stream_name = data_firehose_configuration['stream_name'] 27 | 28 | firehose_log_group_name = f"/aws/kinesisfirehose/{delivery_stream_name}" 29 | 30 | firehose_buffering_hints = data_firehose_configuration["buffering_hints"] 31 | 
firehose_buffer_size = firehose_buffering_hints["size_in_mbs"] 32 | firehose_buffer_interval = firehose_buffering_hints["interval_in_seconds"] 33 | 34 | transform_records_with_aws_lambda = data_firehose_configuration["transform_records_with_aws_lambda"] 35 | firehose_lambda_buffer_size = transform_records_with_aws_lambda["buffer_size"] 36 | firehose_lambda_buffer_interval = transform_records_with_aws_lambda["buffer_interval"] 37 | firehose_lambda_number_of_retries = transform_records_with_aws_lambda["number_of_retries"] 38 | 39 | s3_output_prefix = data_firehose_configuration["output_prefix"] 40 | s3_error_output_prefix = data_firehose_configuration["error_output_prefix"] 41 | 42 | lambda_proc = cfn_delivery_stream.ProcessorProperty( 43 | type="Lambda", 44 | parameters=[ 45 | cfn_delivery_stream.ProcessorParameterProperty( 46 | parameter_name="LambdaArn", 47 | parameter_value='{}:{}'.format( 48 | data_transform_lambda_fn.function_arn, 49 | data_transform_lambda_fn.latest_version.version 50 | ) 51 | ), 52 | cfn_delivery_stream.ProcessorParameterProperty( 53 | parameter_name="NumberOfRetries", 54 | parameter_value=str(firehose_lambda_number_of_retries) 55 | ), 56 | cfn_delivery_stream.ProcessorParameterProperty( 57 | parameter_name="RoleArn", 58 | parameter_value=firehose_role.role_arn 59 | ), 60 | cfn_delivery_stream.ProcessorParameterProperty( 61 | parameter_name="BufferSizeInMBs", 62 | parameter_value=str(firehose_lambda_buffer_size) 63 | ), 64 | cfn_delivery_stream.ProcessorParameterProperty( 65 | parameter_name="BufferIntervalInSeconds", 66 | parameter_value=str(firehose_lambda_buffer_interval) 67 | ) 68 | ] 69 | ) 70 | 71 | firehose_processing_config = cfn_delivery_stream.ProcessingConfigurationProperty( 72 | enabled=True, 73 | processors=[ 74 | lambda_proc 75 | ] 76 | ) 77 | 78 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 79 | dest_iceberg_table_unique_keys = dest_iceberg_table_config.get("unique_keys", None) 80 | dest_iceberg_table_unique_keys = dest_iceberg_table_unique_keys if dest_iceberg_table_unique_keys else None 81 | 82 | iceberg_dest_config = cfn_delivery_stream.IcebergDestinationConfigurationProperty( 83 | catalog_configuration=cfn_delivery_stream.CatalogConfigurationProperty( 84 | catalog_arn=f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog" 85 | ), 86 | role_arn=firehose_role.role_arn, 87 | s3_configuration=cfn_delivery_stream.S3DestinationConfigurationProperty( 88 | bucket_arn=s3_bucket.bucket_arn, 89 | role_arn=firehose_role.role_arn, 90 | buffering_hints={ 91 | "intervalInSeconds": firehose_buffer_interval, 92 | "sizeInMBs": firehose_buffer_size 93 | }, 94 | cloud_watch_logging_options={ 95 | "enabled": True, 96 | "logGroupName": firehose_log_group_name, 97 | "logStreamName": "DestinationDelivery" 98 | }, 99 | compression_format="UNCOMPRESSED", # [GZIP | HADOOP_SNAPPY | Snappy | UNCOMPRESSED | ZIP] 100 | error_output_prefix=s3_error_output_prefix, 101 | prefix=s3_output_prefix, 102 | ), 103 | buffering_hints={ 104 | "intervalInSeconds": firehose_buffer_interval, 105 | "sizeInMBs": firehose_buffer_size 106 | }, 107 | cloud_watch_logging_options={ 108 | "enabled": True, 109 | "logGroupName": firehose_log_group_name, 110 | "logStreamName": "DestinationDelivery" 111 | }, 112 | destination_table_configuration_list=[ 113 | cfn_delivery_stream.DestinationTableConfigurationProperty( 114 | destination_database_name=dest_iceberg_table_config["database_name"], 115 | 
destination_table_name=dest_iceberg_table_config["table_name"], 116 | unique_keys=dest_iceberg_table_unique_keys 117 | ) 118 | ], 119 | processing_configuration=firehose_processing_config, 120 | s3_backup_mode='FailedDataOnly' 121 | ) 122 | 123 | delivery_stream = aws_kinesisfirehose.CfnDeliveryStream(self, "FirehoseToIceberg", 124 | delivery_stream_name=delivery_stream_name, 125 | delivery_stream_type="DirectPut", 126 | iceberg_destination_configuration=iceberg_dest_config, 127 | tags=[{"key": "Name", "value": delivery_stream_name}] 128 | ) 129 | 130 | 131 | cdk.CfnOutput(self, 'S3DestBucket', 132 | value=s3_bucket.bucket_name, 133 | export_name=f'{self.stack_name}-S3DestBucket') 134 | cdk.CfnOutput(self, 'DataFirehoseStreamName', 135 | value=delivery_stream.delivery_stream_name, 136 | export_name=f'{self.stack_name}-FirehoseStreamName') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/lake_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, firehose_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 20 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 21 | database_name=dest_iceberg_table_config["database_name"] 22 | 23 | #XXXX: The role assumed by cdk is not a data lake administrator. 24 | # So, deploying PrincipalPermissions meets the error such as: 25 | # "Resource does not exist or requester is not authorized to access requested permissions." 26 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
27 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 28 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 29 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 30 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 31 | )] 32 | ) 33 | 34 | cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions", 35 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 36 | permissions_with_grant_option=[], 37 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 38 | data_lake_principal_identifier=firehose_role.role_arn 39 | ), 40 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 41 | #XXX: Can't specify a TableWithColumns resource and a Table resource 42 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 43 | catalog_id=cdk.Aws.ACCOUNT_ID, 44 | database_name=database_name, 45 | # name="ALL_TABLES", 46 | table_wildcard={} 47 | ) 48 | ) 49 | ) 50 | cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 51 | 52 | #XXX: In order to keep resource destruction order, 53 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 54 | cfn_principal_permissions.add_dependency(cfn_data_lake_settings) 55 | 56 | 57 | cdk.CfnOutput(self, 'Principal', 58 | value=cfn_principal_permissions.attr_principal_identifier, 59 | export_name=f'{self.stack_name}-Principal') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_s3 as s3 10 | ) 11 | 12 | from constructs import Construct 13 | 14 | 15 | class S3BucketStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | S3_DEFAULT_BUCKET_NAME = f"web-analytics-{self.region}-{self.account}" 21 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 22 | s3_bucket_name = data_firehose_configuration.get('s3_bucket_name', S3_DEFAULT_BUCKET_NAME) 23 | 24 | self.s3_bucket = s3.Bucket(self, "s3bucket", 25 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 26 | bucket_name=s3_bucket_name) 27 | 28 | 29 | cdk.CfnOutput(self, 'S3BucketName', 30 | value=self.s3_bucket.bucket_name, 31 | export_name=f'{self.stack_name}-S3BucketName') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.24.41 2 | mimesis==18.0.0 3 | requests>=2.31.0 4 | 5 | # packages for Lambda Layer 6 | fastavro==1.10.0 7 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.185.0 2 | constructs>=10.0.0,<11.0.0 3 | 
-------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/src/main/python/IcebergTransformer/firehose_to_iceberg_transformer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | #vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import base64 6 | import collections 7 | import json 8 | import logging 9 | import os 10 | from datetime import datetime 11 | 12 | import fastavro 13 | 14 | 15 | LOGGER = logging.getLogger() 16 | if len(LOGGER.handlers) > 0: 17 | # The Lambda environment pre-configures a handler logging to stderr. 18 | # If a handler is already configured, `.basicConfig` does not execute. 19 | # Thus we set the level directly. 20 | LOGGER.setLevel(logging.INFO) 21 | else: 22 | logging.basicConfig(level=logging.INFO) 23 | 24 | 25 | DESTINATION_DATABASE_NAME = os.environ['IcebergDatabaseName'] 26 | DESTINATION_TABLE_NAME = os.environ['IcebergTableName'] 27 | DESTINATION_TABLE_UNIQUE_KEYS = os.environ.get('IcebergTableUniqueKeys', None) 28 | 29 | ORIGINAL_SCHEMA = { 30 | 'name': 'WebLogs', 31 | 'type': 'record', 32 | 'fields': [ 33 | { 34 | 'name': 'user_id', 35 | 'type': 'string' 36 | }, 37 | { 38 | 'name': 'session_id', 39 | 'type': 'string' 40 | }, 41 | { 42 | 'name': 'event', 43 | 'type': 'string' 44 | }, 45 | { 46 | 'name': 'referrer', 47 | 'type': ['string', 'null'] 48 | }, 49 | { 50 | 'name': 'user_agent', 51 | 'type': ['string', 'null'] 52 | }, 53 | { 54 | 'name': 'ip', 55 | 'type': 'string' 56 | }, 57 | { 58 | 'name': 'hostname', 59 | 'type': 'string' 60 | }, 61 | { 62 | 'name': 'os', 63 | 'type': ['string', 'null'] 64 | }, 65 | { 66 | 'name': 'timestamp', 67 | 'type': { 68 | 'type': 'string', 69 | 'logicalType': 'datetime' 70 | } 71 | }, 72 | { 73 | 'name': 'uri', 74 | 'type': 'string' 75 | } 76 | ] 77 | } 78 | 79 | 80 | def read_datetime(data, writer_schema=None, reader_schema=None): 81 | return datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 82 | 83 | 84 | def prepare_datetime(data, schema): 85 | """Converts datetime.datetime to string representing the date and time""" 86 | if isinstance(data, datetime): 87 | return datetime.strftime('%Y-%m-%dT%H:%M:%SZ') 88 | else: 89 | try: 90 | dt = datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 91 | return dt.strftime('%Y-%m-%dT%H:%M:%SZ') 92 | except Exception as ex: 93 | return None 94 | 95 | 96 | fastavro.read.LOGICAL_READERS["string-datetime"] = read_datetime 97 | fastavro.write.LOGICAL_WRITERS["string-datetime"] = prepare_datetime 98 | 99 | PARSED_SCHEMA = fastavro.parse_schema(ORIGINAL_SCHEMA) 100 | 101 | 102 | def check_schema(record): 103 | try: 104 | return fastavro.validation.validate(record, PARSED_SCHEMA, raise_errors=False) 105 | except Exception as ex: 106 | LOGGER.error(ex) 
107 | return False 108 | 109 | 110 | def lambda_handler(event, context): 111 | counter = collections.Counter(total=0, valid=0, invalid=0) 112 | firehose_records_output = {'records': []} 113 | 114 | unique_keys_exist = True if DESTINATION_TABLE_UNIQUE_KEYS else False 115 | otf_metadata_operation = 'insert' if not unique_keys_exist else 'update' 116 | 117 | for record in event['records']: 118 | counter['total'] += 1 119 | 120 | payload = base64.b64decode(record['data']).decode('utf-8') 121 | json_value = json.loads(payload) 122 | 123 | #XXX: check if schema is valid 124 | is_valid = check_schema(json_value) 125 | counter['valid' if is_valid else 'invalid'] += 1 126 | 127 | firehose_record = { 128 | 'data': base64.b64encode(payload.encode('utf-8')), 129 | 'recordId': record['recordId'], 130 | 'result': 'Ok' if is_valid else 'ProcessingFailed', # [Ok, Dropped, ProcessingFailed] 131 | 'metadata': { 132 | 'otfMetadata': { 133 | 'destinationDatabaseName': DESTINATION_DATABASE_NAME, 134 | 'destinationTableName': DESTINATION_TABLE_NAME, 135 | 'operation': otf_metadata_operation 136 | } 137 | } 138 | } 139 | 140 | firehose_records_output['records'].append(firehose_record) 141 | 142 | LOGGER.info(', '.join("{}={}".format(k, v) for k, v in counter.items())) 143 | 144 | return firehose_records_output 145 | 146 | 147 | if __name__ == '__main__': 148 | import pprint 149 | 150 | record_list = [ 151 | ('Ok', { 152 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 153 | "session_id": "a5aa20a72c9e37588f9bbeaa", 154 | "event": "view", 155 | "referrer": "brandon.biz", 156 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 157 | "ip": "202.165.71.49", 158 | "hostname": "toxic.tokyo", 159 | "os": "openSUSE", 160 | "timestamp": "2022-09-16T07:35:46Z", 161 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 162 | }), 163 | ('Ok', { 164 | "user_id": "70b1f606-aa63-47fb-bc92-76de9c59d064", 165 | "session_id": "928e78473db8449b17644b2c", 166 | "event": "like", 167 | # missing optional data 168 | # "referrer": "toe.gq", 169 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.53", 170 | "ip": "12.166.113.176", 171 | "hostname": "drivers.glass", 172 | "os": "Windows 8.1", 173 | "timestamp": "2022-09-16T07:52:47Z", 174 | "uri": "https://aaa.gov/2022/04/29/cialis-prayer-presentations-completed-avenue-vision?trucks=cut&indeed=members" 175 | }), 176 | ('ProcessingFailed', { 177 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 178 | "session_id": "a5aa20a72c9e37588f9bbeaa", 179 | "event": "cart", 180 | "referrer": "brandon.biz", 181 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 182 | "ip": "202.165.71.49", 183 | "hostname": "toxic.tokyo", 184 | "os": "openSUSE", 185 | # invalid datetime format 186 | "timestamp": "2022-09-16 07:35:46", 187 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 188 | }), 189 | ('ProcessingFailed', { 190 | # missing required data 191 | # "user_id": "045e63c7-b276-4117-9706-7c2e3b87d5f5", 192 | "session_id": "abfd47eb7dd7b8aeec0555a7", 193 | "event": "purchase", 194 | "referrer": "transfer.edu", 195 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 9.50", 196 | "ip": "170.128.148.234", 197 | "hostname": "propecia.tc", 198 | "os": "Lubuntu", 199 | "timestamp": "2022-09-16T07:46:04Z", 200 | 
"uri": "https://pee.cloud/2019/06/15/alan-publish-perl-snow-notification-gap-improvement-guaranteed-changed-determining?casino=admissions&cottage=hotel" 201 | }), 202 | ('ProcessingFailed', { 203 | "user_id": "e504cd9d-30da-497f-8f28-2b3f64220e16", 204 | "session_id": "fd4807ab825ee8bd950b1e8b", 205 | "event": "list", 206 | "referrer": "liquid.aquitaine", 207 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.02", 208 | # mismatched data type 209 | "ip": 212234672, 210 | "hostname": "consequently.com", 211 | "os": "Gentoo", 212 | "timestamp": "2022-09-16T07:13:29Z", 213 | "uri": "https://railway.sz/2014/10/30/use-phone-task-marketplace?pot=it&album=cook" 214 | }), 215 | ('ProcessingFailed', { 216 | # mismatched column name 217 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 218 | # mismatched column name 219 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 220 | "event": "visit", 221 | "referrer": "brandon.biz", 222 | # mismatched column name 223 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 224 | "ip": "202.165.71.49", 225 | "hostname": "toxic.tokyo", 226 | "os": "openSUSE", 227 | "timestamp": "2022-09-16T07:35:46Z", 228 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 229 | }) 230 | ] 231 | 232 | for correct_result, record in record_list: 233 | event = { 234 | "invocationId": "invocationIdExample", 235 | "deliveryStreamArn": "arn:aws:kinesis:EXAMPLE", 236 | "region": "us-east-1", 237 | "records": [ 238 | { 239 | "recordId": "49546986683135544286507457936321625675700192471156785154", 240 | "approximateArrivalTimestamp": 1495072949453, 241 | "data": base64.b64encode(json.dumps(record).encode('utf-8')) 242 | } 243 | ] 244 | } 245 | 246 | res = lambda_handler(event, {}) 247 | print(f"\n>> {correct_result} == {res['records'][0]['result']}?", res['records'][0]['result'] == correct_result) 248 | pprint.pprint(res) 249 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import ( 8 | datetime, 9 | timezone 10 | ) 11 | import json 12 | import time 13 | import typing 14 | 15 | from mimesis.locales import Locale 16 | from mimesis.schema import Field, Schema 17 | from mimesis.providers.base import BaseProvider 18 | import requests 19 | 20 | 21 | class CustomDatetime(BaseProvider): 22 | class Meta: 23 | """Class for metadata.""" 24 | name: typing.Final[str] = "custom_datetime" 25 | 26 | def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: 27 | super().__init__(*args, **kwargs) 28 | 29 | def timestamp(self) -> str: 30 | utc_now = datetime.now(timezone.utc) 31 | minute = self.random.randint(0, 59) 32 | second = self.random.randint(0, 59) 33 | random_datetime = utc_now.replace(minute=minute, second=second) 34 | return random_datetime.strftime("%Y-%m-%dT%H:%M:%SZ") 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | 40 | parser.add_argument('--api-url', help='log collector api url') 41 | parser.add_argument('--api-method', default='records', choices=['record', 'records'], 42 | help='log collector api method [record | records]') 43 | parser.add_argument('--stream-name', 
help='kinesis stream name') 44 | parser.add_argument('--max-count', default=15, type=int, help='max number of records to put') 45 | parser.add_argument('--dry-run', action='store_true') 46 | 47 | options = parser.parse_args() 48 | 49 | _field = Field(locale=Locale.EN) 50 | _field._generic.add_provider(CustomDatetime) 51 | 52 | schema_definition = lambda: { 53 | "user_id": _field("uuid"), 54 | "session_id": _field("token_hex", entropy=12), 55 | "event": _field("choice", items=['visit', 'view', 'list', 'like', 'cart', 'purchase']), 56 | "referrer": _field("internet.hostname"), 57 | "user_agent": _field("internet.user_agent"), 58 | "ip": _field("internet.ip_v4"), 59 | "hostname": _field("internet.hostname"), 60 | "os": _field("development.os"), 61 | "timestamp": _field("custom_datetime.timestamp"), 62 | "uri": _field("internet.uri", query_params_count=2) 63 | } 64 | schema = Schema(schema=schema_definition, iterations=options.max_count) 65 | 66 | log_collector_url = f'{options.api_url}/streams/{options.stream_name}/{options.api_method}' if not options.dry_run else None 67 | 68 | for record in schema: 69 | if options.dry_run: 70 | print(json.dumps(record), file=sys.stderr) 71 | continue 72 | 73 | if options.api_method == 'record': 74 | data = {'Data': record} 75 | payload = f'{json.dumps(data)}' 76 | else: 77 | #XXX: make sure data has newline 78 | data = {"records":[{'data': f'{json.dumps(record)}\n'}]} 79 | payload = json.dumps(data) 80 | 81 | res = requests.put(log_collector_url, data=payload, headers={'Content-Type': 'application/json'}) 82 | if res.status_code == 200: 83 | print(f'[{res.status_code} {res.reason}]', res.text, file=sys.stderr) 84 | else: 85 | print(f'[{res.status_code} {res.reason}]', file=sys.stderr) 86 | sys.exit(1) 87 | time.sleep(0.5) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /web-analytics-iceberg/.example.cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "acknowledged-issue-numbers": [ 3 | 32775 4 | ], 5 | "firehose_data_tranform_lambda": { 6 | "s3_bucket_name": "s3-bucket-name-for-lambda-layer-resources", 7 | "s3_object_key": "var/fastavro-lib.zip" 8 | }, 9 | "data_firehose_configuration": { 10 | "buffering_hints": { 11 | "interval_in_seconds": 60, 12 | "size_in_mbs": 128 13 | }, 14 | "transform_records_with_aws_lambda": { 15 | "buffer_size": 3, 16 | "buffer_interval": 300, 17 | "number_of_retries": 3 18 | }, 19 | "destination_iceberg_table_configuration": { 20 | "database_name": "web_log_iceberg_db", 21 | "table_name": "web_log_iceberg", 22 | "unique_keys": [ 23 | "user_id", "timestamp" 24 | ] 25 | }, 26 | "output_prefix": "web_log_iceberg_db/web_log_iceberg", 27 | "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /web-analytics-iceberg/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /web-analytics-iceberg/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web Log Analytics 
with Amazon Kinesis Data Streams Proxy using Amazon API Gateway 3 | 4 | This repository provides CDK scripts and sample code that show how to implement a simple [web analytics](https://en.wikipedia.org/wiki/Web_analytics) system.
5 | The diagram below shows what we are implementing. 6 | 7 | ![web-analytics-arch](web-analytics-iceberg-arch.svg) 8 | 9 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 10 | 11 | This project is set up like a standard Python project. The initialization 12 | process also creates a virtualenv within this project, stored under the `.venv` 13 | directory. To create the virtualenv it assumes that there is a `python3` 14 | (or `python` for Windows) executable in your path with access to the `venv` 15 | package. If for any reason the automatic creation of the virtualenv fails, 16 | you can create the virtualenv manually. 17 | 18 | To manually create a virtualenv on MacOS and Linux: 19 | 20 | ``` 21 | $ python3 -m venv .venv 22 | ``` 23 | 24 | After the init process completes and the virtualenv is created, you can use the following 25 | step to activate your virtualenv. 26 | 27 | ``` 28 | $ source .venv/bin/activate 29 | ``` 30 | 31 | If you are on a Windows platform, you can activate the virtualenv like this: 32 | 33 | ``` 34 | % .venv\Scripts\activate.bat 35 | ``` 36 | 37 | Once the virtualenv is activated, you can install the required dependencies. 38 | 39 | ``` 40 | (.venv) $ pip install -r requirements.txt 41 | ``` 42 | 43 | To add additional dependencies, for example other CDK libraries, just add 44 | them to your `requirements.txt` file and rerun the `pip install -r requirements.txt` 45 | command. 46 | 47 | ### Upload Lambda Layer code 48 | 49 | Before deployment, you should upload the zipped code files to S3 like this: 50 |
 51 | (.venv) $ aws s3api create-bucket --bucket your-s3-bucket-name-for-lambda-layer-code --region region-name
 52 | (.venv) $ ./build-aws-lambda-layer-package.sh your-s3-bucket-name-for-lambda-layer-code
 53 | 
54 | 55 | > :warning: To create a bucket outside of the `us-east-1` region, the `aws s3api create-bucket` command requires the appropriate **LocationConstraint** to be specified so that the bucket is created in the desired region. For more information, see these [examples](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3api/create-bucket.html#examples). 56 | 57 | > :warning: Make sure you have **Docker** installed. 58 | 59 | For example, 60 |
 61 | (.venv) $ aws s3api create-bucket --bucket lambda-layer-resources --region us-east-1
 62 | (.venv) $ ./build-aws-lambda-layer-package.sh lambda-layer-resources
 63 | 
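If you want the Lambda layer bucket in a region other than `us-east-1`, the same `create-bucket` call needs the **LocationConstraint** mentioned above. A minimal sketch, assuming `us-west-2` and the same illustrative bucket name:

```
(.venv) $ aws s3api create-bucket --bucket lambda-layer-resources \
              --region us-west-2 \
              --create-bucket-configuration LocationConstraint=us-west-2
```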
64 | 65 | For more information about how to create a package for an AWS Lambda layer, see [here](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/). 66 | 67 | ### Deploy 68 | 69 | Before synthesizing the CloudFormation template for this code, you should update the `cdk.context.json` file.
70 | In particular, you need to fill in the S3 location of the Lambda layer code uploaded earlier. 71 | 72 | For example, 73 |
 74 | {
 75 |   "firehose_data_tranform_lambda": {
 76 |     "s3_bucket_name": "lambda-layer-resources",
 77 |     "s3_object_key": "var/fastavro-lib.zip"
 78 |   },
 79 |   "data_firehose_configuration": {
 80 |     "buffering_hints": {
 81 |       "interval_in_seconds": 60,
 82 |       "size_in_mbs": 128
 83 |     },
 84 |     "transform_records_with_aws_lambda": {
 85 |       "buffer_size": 3,
 86 |       "buffer_interval": 300,
 87 |       "number_of_retries": 3
 88 |     },
 89 |     "destination_iceberg_table_configuration": {
 90 |       "database_name": "web_log_iceberg_db",
 91 |       "table_name": "web_log_iceberg"
 92 |     },
 93 |     "output_prefix": "web_log_iceberg_db/web_log_iceberg",
 94 |     "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}"
 95 |   }
 96 | }
 97 | 
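If you do not have a `cdk.context.json` file yet, one simple starting point (assuming you are in the `web-analytics-iceberg` project directory) is to copy the bundled `.example.cdk.context.json` and then edit the values shown above:

```
(.venv) $ cp .example.cdk.context.json cdk.context.json
```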
98 | :information_source: The `database_name` and `table_name` values in `data_firehose_configuration.destination_iceberg_table_configuration` are used in the [**Set up Delivery Stream**](#set-up-delivery-stream) step. 99 | 100 | :information_source: When updating or deleting records in an Iceberg table, specify the table's primary key column name(s) as `unique_keys` in the `data_firehose_configuration.destination_iceberg_table_configuration` settings. 101 | For example, 102 |
103 | "destination_iceberg_table_configuration": {
104 |   "database_name": "web_log_iceberg_db",
105 |   "table_name": "web_log_iceberg",
106 |   "unique_keys": [
107 |     "user_id", "timestamp"
108 |   ]
109 | }
110 | 
111 | 112 | 113 | Now you are ready to synthesize the CloudFormation template for this code.
114 | 115 |
116 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
117 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
118 | (.venv) $ cdk synth --all
119 | 
120 | 121 | Now let's try to deploy. 122 | 123 | ## List all CDK Stacks 124 | 125 | ``` 126 | (.venv) $ cdk list 127 | WebAnalyticsKdsProxyApiGw 128 | WebAnalyticsKinesisStream 129 | WebAnalyticsDataFirehoseToIcebergS3Path 130 | WebAnalyticsFirehoseDataTransformLambdaStack 131 | WebAnalyticsFirehoseToIcebergRoleStack 132 | WebAnalyticsGrantLFPermissionsOnFirehoseRole 133 | WebAnalyticsFirehoseToIcebergStack 134 | ``` 135 | 136 | Use `cdk deploy` command to create the stack shown above. 137 | 138 | ## Create API endpoint for web data collection 139 | 140 |
141 | (.venv) $ cdk deploy --require-approval never \
142 |               WebAnalyticsKdsProxyApiGw \
143 |               WebAnalyticsKinesisStream
144 | 
145 | 146 | ## Set up Delivery Stream 147 | 148 | 1. Create a S3 bucket for Apache Iceberg table 149 |
150 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsDataFirehoseToIcebergS3Path
151 |    
152 | 2. Create a table with partitioned data in Amazon Athena 153 | 154 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console.
155 | * (step 1) Create a database 156 | 157 | In order to create a new database called `web_log_iceberg_db`, enter the following statement in the Athena query editor and click the **Run** button to execute the query. 158 | 159 |
160 |       CREATE DATABASE IF NOT EXISTS web_log_iceberg_db;
161 |       
162 | 163 | * (step 2) Create a table 164 | 165 | Copy the following query into the Athena query editor. 166 | 167 | Update `LOCATION` to your S3 bucket name and execute the query to create a new table. 168 |
169 |       CREATE TABLE web_log_iceberg_db.web_log_iceberg (
170 |         `user_id` string,
171 |         `session_id` string,
172 |         `event` string,
173 |         `referrer` string,
174 |         `user_agent` string,
175 |         `ip` string,
176 |         `hostname` string,
177 |         `os` string,
178 |         `timestamp` timestamp,
179 |         `uri` string
180 |       )
181 |       PARTITIONED BY (event)
182 |       LOCATION 's3://web-analytics-{region}-{account_id}/web_log_iceberg_db/web_log_iceberg'
183 |       TBLPROPERTIES (
184 |         'table_type'='iceberg',
185 |         'format'='parquet',
186 |         'write_compression'='snappy',
187 |         'optimize_rewrite_delete_file_threshold'='10'
188 |       );
189 |       
190 | If the query is successful, a table named `web_log_iceberg` is created and displayed on the left panel under the **Tables** section. 191 | 192 | If you get an error, check if (a) you have updated the `LOCATION` to the correct S3 bucket name, (b) you have `web_log_iceberg_db` selected under the Database dropdown, and (c) you have `AwsDataCatalog` selected as the **Data source**. 193 | 3. Create a lambda function to process the streaming data. 194 |
195 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseDataTransformLambdaStack
196 |    
197 | 4. To allow Data Firehose to ingest data into the Apache Iceberg table, create an IAM role and grant permissions to the role. 198 |
199 |    (.venv) $ cdk deploy --require-approval never \
200 |                  WebAnalyticsFirehoseToIcebergRoleStack \
201 |                  WebAnalyticsGrantLFPermissionsOnFirehoseRole
202 |    
203 | 204 | :information_source: If you fail to create the table, give Athena users access permissions on `web_log_iceberg_db` through [AWS Lake Formation](https://console.aws.amazon.com/lakeformation/home), or grant Amazon Data Firehose access to `web_log_iceberg_db` by running the following commands: 205 |
206 |    (.venv) $ aws lakeformation grant-permissions \
207 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
208 |                  --permissions CREATE_TABLE DESCRIBE ALTER DROP \
209 |                  --resource '{ "Database": { "Name": "web_log_iceberg_db" } }'
210 |    (.venv) $ aws lakeformation grant-permissions \
211 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
212 |                  --permissions SELECT DESCRIBE ALTER INSERT DELETE DROP \
213 |                  --resource '{ "Table": {"DatabaseName": "web_log_iceberg_db", "TableWildcard": {}} }'
214 |    
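   In the commands above, `arn:aws:iam::{account-id}:role/role-id` is the ARN of the Data Firehose role. One way to look it up, assuming the role was created by the `WebAnalyticsFirehoseToIcebergRoleStack` stack (which exports it as the `FirehoseRoleArn` output), is:

   ```
   (.venv) $ aws cloudformation describe-stacks \
                 --stack-name WebAnalyticsFirehoseToIcebergRoleStack \
                 --query "Stacks[0].Outputs[?OutputKey=='FirehoseRoleArn'].OutputValue" \
                 --output text
   ```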
215 | 5. Deploy Amazon Data Firehose. 216 |
217 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseToIcebergStack
218 |    
219 | 220 | ## Run Test 221 | 222 | 1. Run `GET /streams` method to invoke `ListStreams` in Kinesis 223 |
224 |    $ curl -X GET https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1/streams
225 |    
226 | 227 | The response is: 228 |
229 |    {
230 |      "HasMoreStreams": false,
231 |      "StreamNames": [
232 |        "PUT-Firehose-aEhWz"
233 |      ],
234 |      "StreamSummaries": [
235 |        {
236 |          "StreamARN": "arn:aws:kinesis:us-east-1:123456789012:stream/PUT-Firehose-aEhWz",
237 |          "StreamCreationTimestamp": 1661612556,
238 |          "StreamModeDetails": {
239 |            "StreamMode": "ON_DEMAND"
240 |          },
241 |          "StreamName": "PUT-Firehose-aEhWz",
242 |          "StreamStatus": "ACTIVE"
243 |        }
244 |      ]
245 |    }
246 |    
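   The deployed proxy API also exposes `GET /streams/{stream-name}` (see `cdk_stacks/apigw.py`), so you can describe a single stream in the same way; for example, using the stream name returned above:

   ```
   $ curl -X GET https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1/streams/PUT-Firehose-aEhWz
   ```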
247 | 248 | 2. Generate test data. 249 |
250 |    (.venv) $ pip install -r requirements-dev.txt
251 |    (.venv) $ python src/utils/gen_fake_data.py --max-count 5 --stream-name PUT-Firehose-aEhWz --api-url 'https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1' --api-method records
252 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260289903462649185194773668901646666226496176178","ShardId":"shardId-000000000003"}]}
253 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260289903462649185194774877827466280924390359090","ShardId":"shardId-000000000003"}]}
254 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260223001227053593325351479598467950537766600706","ShardId":"shardId-000000000000"}]}
255 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260245301972252123948494224242560213528447287314","ShardId":"shardId-000000000001"}]}
256 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260223001227053593325353897450107179933554966530","ShardId":"shardId-000000000000"}]}
257 |    
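   If you only want to inspect the generated records without calling the API, `gen_fake_data.py` also accepts a `--dry-run` flag that prints each record instead of sending it, for example:

   ```
   (.venv) $ python src/utils/gen_fake_data.py --dry-run --max-count 2
   ```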
258 | 259 | 3. Check streaming data in S3 260 | 261 | After `5~10` minutes, you can see that the streaming data have been delivered from **Kinesis Data Streams** to **S3**. 262 | 263 | ![iceberg-table](./assets/wa-iceberg-table.png) 264 | ![iceberg-table-data-level-01](./assets/wa-iceberg-data-level-01.png) 265 | ![iceberg-table-data-level-02](./assets/wa-iceberg-data-level-02.png) 266 | ![iceberg-table-data-level-03](./assets/wa-iceberg-data-level-03.png) 267 | 268 | 4. Run test query using Amazon Athena 269 | 270 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console. 271 | 272 | * (Step 1) Specify the workgroup to use 273 | 274 | To run queries, switch to the appropriate workgroup like this: 275 | ![amazon-athena-switching-to-workgroup](./assets/amazon-athena-switching-to-workgroup.png) 276 | 277 | * (Step 2) Run test query 278 | 279 | Enter the following SQL statement and execute the query. 280 |
281 |      SELECT COUNT(*)
282 |      FROM web_log_iceberg_db.web_log_iceberg;
283 |      
284 | 285 | ## Clean Up 286 | 287 | Delete the CloudFormation stack by running the below command. 288 |
289 | (.venv) $ cdk destroy --force --all
290 | 
291 | 292 | 293 | ## Useful commands 294 | 295 | * `cdk ls` list all stacks in the app 296 | * `cdk synth` emits the synthesized CloudFormation template 297 | * `cdk deploy` deploy this stack to your default AWS account/region 298 | * `cdk diff` compare deployed stack with current state 299 | * `cdk docs` open CDK documentation 300 | 301 | Enjoy! 302 | 303 | ## References 304 | 305 | * [Web Analytics](https://en.wikipedia.org/wiki/Web_analytics) 306 | * [Tutorial: Create a REST API as an Amazon Kinesis proxy in API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html) 307 | * [Streaming Data Solution for Amazon Kinesis](https://aws.amazon.com/ko/solutions/implementations/aws-streaming-data-solution-for-amazon-kinesis/) 308 |
309 | 310 |
311 | * [(AWS Developer Guide) Deliver data to Apache Iceberg Tables with Amazon Data Firehose](https://docs.aws.amazon.com/firehose/latest/dev/apache-iceberg-destination.html) 312 | * [Building fine-grained authorization using Amazon Cognito, API Gateway, and IAM](https://aws.amazon.com/ko/blogs/security/building-fine-grained-authorization-using-amazon-cognito-api-gateway-and-iam/) 313 | * [AWS Lake Formation - Create a data lake administrator](https://docs.aws.amazon.com/lake-formation/latest/dg/getting-started-setup.html#create-data-lake-admin) 314 | * [AWS Lake Formation Permissions Reference](https://docs.aws.amazon.com/lake-formation/latest/dg/lf-permissions-reference.html) 315 | * [Amazon Athena Using Iceberg tables](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html) 316 | * [Amazon Athena Workshop](https://athena-in-action.workshop.aws/) 317 | * [Curl Cookbook](https://catonmat.net/cookbooks/curl) 318 | * [fastavro](https://fastavro.readthedocs.io/) - Fast read/write of `AVRO` files 319 | * [Apache Avro Specification](https://avro.apache.org/docs/current/spec.html) 320 | * [How to create a Lambda layer using a simulated Lambda environment with Docker](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/) 321 | ``` 322 | $ cat <<EOF > requirements-Lambda-Layer.txt 323 | > fastavro==1.6.1 324 | > EOF 325 | $ docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install -r requirements-Lambda-Layer.txt -t python/lib/python3.11/site-packages/; exit" 326 | $ zip -r fastavro-lib.zip python > /dev/null 327 | $ aws s3 mb s3://my-bucket-for-lambda-layer-packages 328 | $ aws s3 cp fastavro-lib.zip s3://my-bucket-for-lambda-layer-packages/ 329 | ``` 330 | 331 | ## Security 332 | 333 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 334 | 335 | ## License 336 | 337 | This library is licensed under the MIT-0 License. See the LICENSE file.
338 | 339 | -------------------------------------------------------------------------------- /web-analytics-iceberg/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from cdk_stacks import ( 7 | KdsProxyApiGwStack, 8 | KdsStack, 9 | FirehoseToIcebergStack, 10 | FirehoseRoleStack, 11 | FirehoseDataProcLambdaStack, 12 | DataLakePermissionsStack, 13 | S3BucketStack, 14 | ) 15 | 16 | AWS_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 17 | region=os.getenv('CDK_DEFAULT_REGION')) 18 | 19 | app = cdk.App() 20 | 21 | kds_proxy_apigw = KdsProxyApiGwStack(app, 'WebAnalyticsKdsProxyApiGw') 22 | kds_stack = KdsStack(app, 'WebAnalyticsKinesisStream') 23 | 24 | s3_dest_bucket = S3BucketStack(app, 'WebAnalyticsDataFirehoseToIcebergS3Path', 25 | env=AWS_ENV) 26 | s3_dest_bucket.add_dependency(kds_stack) 27 | 28 | firehose_data_transform_lambda = FirehoseDataProcLambdaStack(app, 29 | 'WebAnalyticsFirehoseDataTransformLambdaStack', 30 | env=AWS_ENV 31 | ) 32 | firehose_data_transform_lambda.add_dependency(s3_dest_bucket) 33 | 34 | firehose_role = FirehoseRoleStack(app, 'WebAnalyticsFirehoseToIcebergRoleStack', 35 | firehose_data_transform_lambda.data_proc_lambda_fn, 36 | kds_stack.kinesis_stream, 37 | s3_dest_bucket.s3_bucket, 38 | env=AWS_ENV 39 | ) 40 | firehose_role.add_dependency(firehose_data_transform_lambda) 41 | 42 | grant_lake_formation_permissions = DataLakePermissionsStack(app, 'WebAnalyticsGrantLFPermissionsOnFirehoseRole', 43 | firehose_role.firehose_role, 44 | env=AWS_ENV 45 | ) 46 | grant_lake_formation_permissions.add_dependency(firehose_role) 47 | 48 | firehose_stack = FirehoseToIcebergStack(app, 'WebAnalyticsFirehoseToIcebergStack', 49 | firehose_data_transform_lambda.data_proc_lambda_fn, 50 | kds_stack.kinesis_stream, 51 | s3_dest_bucket.s3_bucket, 52 | firehose_role.firehose_role, 53 | env=AWS_ENV 54 | ) 55 | firehose_stack.add_dependency(grant_lake_formation_permissions) 56 | 57 | app.synth() 58 | -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/amazon-athena-switching-to-workgroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/amazon-athena-switching-to-workgroup.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-data-level-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-data-level-01.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-data-level-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-data-level-02.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-data-level-03.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-data-level-03.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-table.png -------------------------------------------------------------------------------- /web-analytics-iceberg/build-aws-lambda-layer-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash - 2 | 3 | VERSION=1.10.0 4 | PY_VERSION=3.11 5 | LAMBDA_LAYER_NAME=fastavro-lib-${VERSION}-py-${PY_VERSION} 6 | S3_PATH=$1 7 | 8 | docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install fastavro==${VERSION} -t python/lib/python3.11/site-packages/; exit" 9 | 10 | zip -q -r ${LAMBDA_LAYER_NAME}.zip python >/dev/null 11 | aws s3 cp --quiet ${LAMBDA_LAYER_NAME}.zip s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip 12 | echo "[Lambda_Layer_Code_S3_Path] s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip" 13 | 14 | -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .apigw import KdsProxyApiGwStack 2 | from .firehose_to_iceberg import FirehoseToIcebergStack 3 | from .firehose_role import FirehoseRoleStack 4 | from .firehose_data_proc_lambda import FirehoseDataProcLambdaStack 5 | from .kds import KdsStack 6 | from .lake_formation import DataLakePermissionsStack 7 | from .s3 import S3BucketStack -------------------------------------------------------------------------------- 
/web-analytics-iceberg/cdk_stacks/apigw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import json 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_apigateway, 12 | aws_iam, 13 | ) 14 | from constructs import Construct 15 | 16 | 17 | class KdsProxyApiGwStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | apigw_kds_access_role_policy_doc = aws_iam.PolicyDocument() 23 | apigw_kds_access_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": ["*"], 26 | "actions": [ 27 | "kinesis:DescribeStream", 28 | "kinesis:PutRecord", 29 | "kinesis:PutRecords"] 30 | })) 31 | 32 | apigw_kds_role = aws_iam.Role(self, "APIGatewayRoleToAccessKinesisDataStreams", 33 | role_name='APIGatewayRoleToAccessKinesisDataStreams', 34 | assumed_by=aws_iam.ServicePrincipal('apigateway.amazonaws.com'), 35 | inline_policies={ 36 | 'KinesisWriteAccess': apigw_kds_access_role_policy_doc 37 | }, 38 | managed_policies=[ 39 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisReadOnlyAccess') 40 | ] 41 | ) 42 | 43 | #XXX: Start to create an API as a Kinesis proxy 44 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-api-as-kinesis-proxy 45 | kds_proxy_api = aws_apigateway.RestApi(self, "KdsProxyAPI", 46 | rest_api_name="log-collector", 47 | description="An Amazon API Gateway REST API that integrated with an Amazon Kinesis Data Streams.", 48 | endpoint_types=[aws_apigateway.EndpointType.REGIONAL], 49 | default_cors_preflight_options={ 50 | "allow_origins": aws_apigateway.Cors.ALL_ORIGINS 51 | }, 52 | deploy=True, 53 | deploy_options=aws_apigateway.StageOptions(stage_name="v1"), 54 | endpoint_export_name="KdsProxyAPIEndpoint" 55 | ) 56 | 57 | apigw_error_responses = [ 58 | aws_apigateway.IntegrationResponse(status_code="400", selection_pattern="4\\d{2}"), 59 | aws_apigateway.IntegrationResponse(status_code="500", selection_pattern="5\\d{2}") 60 | ] 61 | 62 | #XXX: GET /streams 63 | # List Kinesis streams by using the API Gateway console 64 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-list-kinesis-streams 65 | 66 | streams_resource = kds_proxy_api.root.add_resource("streams") 67 | 68 | list_streams_options = aws_apigateway.IntegrationOptions( 69 | credentials_role=apigw_kds_role, 70 | integration_responses=[ 71 | aws_apigateway.IntegrationResponse( 72 | status_code="200" 73 | ), 74 | *apigw_error_responses 75 | ], 76 | request_templates={ 77 | 'application/json': '{}' 78 | }, 79 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 80 | ) 81 | 82 | list_streams_integration = aws_apigateway.AwsIntegration( 83 | service='kinesis', 84 | action='ListStreams', 85 | integration_http_method='POST', 86 | options=list_streams_options 87 | ) 88 | 89 | streams_resource.add_method("GET", list_streams_integration, 90 | # Default `authorization_type`: - open access unless `authorizer` is specified 91 | authorization_type=aws_apigateway.AuthorizationType.NONE, 92 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 93 | response_models={ 94 | 'application/json': 
aws_apigateway.Model.EMPTY_MODEL 95 | } 96 | ), 97 | aws_apigateway.MethodResponse(status_code='400'), 98 | aws_apigateway.MethodResponse(status_code='500') 99 | ]) 100 | 101 | #XXX: GET /streams/{stream-name} 102 | # Describe a stream in Kinesis 103 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-describe-delete-stream 104 | one_stream_resource = streams_resource.add_resource("{stream-name}") 105 | 106 | describe_stream_options = aws_apigateway.IntegrationOptions( 107 | credentials_role=apigw_kds_role, 108 | integration_responses=[ 109 | aws_apigateway.IntegrationResponse( 110 | status_code="200" 111 | ), 112 | *apigw_error_responses 113 | ], 114 | request_templates={ 115 | 'application/json': json.dumps({ 116 | "StreamName": "$input.params('stream-name')" 117 | }, indent=2) 118 | }, 119 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 120 | ) 121 | 122 | describe_stream_integration = aws_apigateway.AwsIntegration( 123 | service='kinesis', 124 | action='DescribeStream', 125 | integration_http_method='POST', 126 | options=describe_stream_options 127 | ) 128 | 129 | one_stream_resource.add_method("GET", describe_stream_integration, 130 | # Default `authorization_type`: - open access unless `authorizer` is specified 131 | authorization_type=aws_apigateway.AuthorizationType.NONE, 132 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 133 | response_models={ 134 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 135 | } 136 | ), 137 | aws_apigateway.MethodResponse(status_code='400'), 138 | aws_apigateway.MethodResponse(status_code='500') 139 | ]) 140 | 141 | #XXX: PUT /streams/{stream-name}/record 142 | # Put a record into a stream in Kinesis 143 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 144 | record_resource = one_stream_resource.add_resource("record") 145 | 146 | put_record_request_mapping_templates = ''' 147 | { 148 | "StreamName": "$input.params('stream-name')", 149 | "Data": "$util.base64Encode($input.json('$.Data'))", 150 | "PartitionKey": "$input.path('$.PartitionKey')" 151 | } 152 | ''' 153 | 154 | put_record_options = aws_apigateway.IntegrationOptions( 155 | credentials_role=apigw_kds_role, 156 | integration_responses=[ 157 | aws_apigateway.IntegrationResponse( 158 | status_code="200" 159 | ), 160 | *apigw_error_responses 161 | ], 162 | request_templates={ 163 | 'application/json': put_record_request_mapping_templates 164 | }, 165 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 166 | ) 167 | 168 | put_record_integration = aws_apigateway.AwsIntegration( 169 | service='kinesis', 170 | action='PutRecord', 171 | integration_http_method='POST', 172 | options=put_record_options 173 | ) 174 | 175 | record_resource.add_method("PUT", put_record_integration, 176 | # Default `authorization_type`: - open access unless `authorizer` is specified 177 | authorization_type=aws_apigateway.AuthorizationType.NONE, 178 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 179 | response_models={ 180 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 181 | } 182 | ), 183 | aws_apigateway.MethodResponse(status_code='400'), 184 | aws_apigateway.MethodResponse(status_code='500') 185 | ]) 186 | 187 | 188 | #XXX: PUT /streams/{stream-name}/records 189 | # Put records into a stream in Kinesis 190 | # 
https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 191 | records_resource = one_stream_resource.add_resource("records") 192 | 193 | put_records_request_mapping_templates = ''' 194 | { 195 | "StreamName": "$input.params('stream-name')", 196 | "Records": [ 197 | #foreach($elem in $input.path('$.records')) 198 | { 199 | "Data": "$util.base64Encode($elem.data)", 200 | "PartitionKey": "$elem.partition-key" 201 | }#if($foreach.hasNext),#end 202 | #end 203 | ] 204 | } 205 | ''' 206 | 207 | put_records_options = aws_apigateway.IntegrationOptions( 208 | credentials_role=apigw_kds_role, 209 | integration_responses=[ 210 | aws_apigateway.IntegrationResponse( 211 | status_code="200" 212 | ), 213 | *apigw_error_responses 214 | ], 215 | request_templates={ 216 | 'application/json': put_records_request_mapping_templates 217 | }, 218 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 219 | ) 220 | 221 | put_records_integration = aws_apigateway.AwsIntegration( 222 | service='kinesis', 223 | action='PutRecords', 224 | integration_http_method='POST', 225 | options=put_records_options 226 | ) 227 | 228 | records_resource.add_method("PUT", put_records_integration, 229 | # Default `authorization_type`: - open access unless `authorizer` is specified 230 | authorization_type=aws_apigateway.AuthorizationType.NONE, 231 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 232 | response_models={ 233 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 234 | } 235 | ), 236 | aws_apigateway.MethodResponse(status_code='400'), 237 | aws_apigateway.MethodResponse(status_code='500') 238 | ]) 239 | 240 | cdk.CfnOutput(self, 'KdsRestApiName', 241 | value=kds_proxy_api.rest_api_name, 242 | export_name=f'{self.stack_name}-KdsProxyRestApiName') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/firehose_data_proc_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_lambda, 12 | aws_logs, 13 | aws_s3 as s3 14 | ) 15 | from constructs import Construct 16 | 17 | 18 | class FirehoseDataProcLambdaStack(Stack): 19 | 20 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 21 | super().__init__(scope, construct_id, **kwargs) 22 | 23 | firehose_data_transform_lambda_config = self.node.try_get_context('firehose_data_tranform_lambda') 24 | LAMBDA_LAYER_CODE_S3_BUCKET = firehose_data_transform_lambda_config['s3_bucket_name'] 25 | LAMBDA_LAYER_CODE_S3_OBJ_KEY = firehose_data_transform_lambda_config['s3_object_key'] 26 | 27 | s3_lambda_layer_lib_bucket = s3.Bucket.from_bucket_name(self, "LambdaLayerS3Bucket", LAMBDA_LAYER_CODE_S3_BUCKET) 28 | lambda_lib_layer = aws_lambda.LayerVersion(self, "SchemaValidatorLib", 29 | layer_version_name="fastavro-lib", 30 | compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11], 31 | code=aws_lambda.Code.from_bucket(s3_lambda_layer_lib_bucket, LAMBDA_LAYER_CODE_S3_OBJ_KEY) 32 | ) 33 | 34 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 35 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 36 | dest_iceberg_table_unique_keys = 
dest_iceberg_table_config.get("unique_keys", None) 37 | dest_iceberg_table_unique_keys = ",".join(dest_iceberg_table_unique_keys) if dest_iceberg_table_unique_keys else "" 38 | 39 | LAMBDA_FN_NAME = "WebAnalyticsFirehoseToIcebergTransformer" 40 | self.data_proc_lambda_fn = aws_lambda.Function(self, "FirehoseToIcebergTransformer", 41 | runtime=aws_lambda.Runtime.PYTHON_3_11, 42 | function_name=LAMBDA_FN_NAME, 43 | handler="firehose_to_iceberg_transformer.lambda_handler", 44 | description="Transform records to Apache Iceberg table", 45 | code=aws_lambda.Code.from_asset(os.path.join(os.path.dirname(__file__), '../src/main/python/IcebergTransformer')), 46 | environment={ 47 | "IcebergDatabaseName": dest_iceberg_table_config["database_name"], 48 | "IcebergTableName": dest_iceberg_table_config["table_name"], 49 | "IcebergTableUniqueKeys": dest_iceberg_table_unique_keys 50 | }, 51 | timeout=cdk.Duration.minutes(5), 52 | #XXX: set memory size appropriately 53 | memory_size=256, 54 | layers=[lambda_lib_layer] 55 | ) 56 | 57 | log_group = aws_logs.LogGroup(self, "FirehoseToIcebergTransformerLogGroup", 58 | #XXX: Circular dependency between resources occurs 59 | # if aws_lambda.Function.function_name is used 60 | # instead of literal name of lambda function such as "FirehoseToIcebergTransformer" 61 | log_group_name=f"/aws/lambda/{LAMBDA_FN_NAME}", 62 | retention=aws_logs.RetentionDays.THREE_DAYS, 63 | removal_policy=cdk.RemovalPolicy.DESTROY 64 | ) 65 | log_group.grant_write(self.data_proc_lambda_fn) 66 | 67 | 68 | cdk.CfnOutput(self, 'FirehoseDataProcFuncName', 69 | value=self.data_proc_lambda_fn.function_name, 70 | export_name=f'{self.stack_name}-FirehoseDataProcFuncName') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/firehose_role.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_iam 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class FirehoseRoleStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, 17 | data_transform_lambda_fn, source_kinesis_stream, s3_bucket, 18 | **kwargs) -> None: 19 | 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | firehose_role_policy_doc = aws_iam.PolicyDocument() 23 | 24 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 25 | effect=aws_iam.Effect.ALLOW, 26 | resources=[source_kinesis_stream.stream_arn], 27 | actions=[ 28 | "kinesis:DescribeStream", 29 | "kinesis:GetShardIterator", 30 | "kinesis:GetRecords", 31 | "kinesis:ListShards" 32 | ] 33 | )) 34 | 35 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 36 | "effect": aws_iam.Effect.ALLOW, 37 | "resources": [s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)], 38 | "actions": [ 39 | "s3:AbortMultipartUpload", 40 | "s3:GetBucketLocation", 41 | "s3:GetObject", 42 | "s3:ListBucket", 43 | "s3:ListBucketMultipartUploads", 44 | "s3:PutObject", 45 | "s3:DeleteObject" 46 | ] 47 | })) 48 | 49 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 50 | "effect": aws_iam.Effect.ALLOW, 51 | "resources": [ 52 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog", 53 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:database/*", 54 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:table/*/*" 55 | ], 56 | 
"actions": [ 57 | "glue:GetTable", 58 | "glue:GetDatabase", 59 | "glue:UpdateTable" 60 | ] 61 | })) 62 | 63 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 64 | effect=aws_iam.Effect.ALLOW, 65 | resources=["*"], 66 | actions=[ 67 | "ec2:DescribeVpcs", 68 | "ec2:DescribeVpcAttribute", 69 | "ec2:DescribeSubnets", 70 | "ec2:DescribeSecurityGroups", 71 | "ec2:DescribeNetworkInterfaces", 72 | "ec2:CreateNetworkInterface", 73 | "ec2:CreateNetworkInterfacePermission", 74 | "ec2:DeleteNetworkInterface" 75 | ] 76 | )) 77 | 78 | #XXX: https://docs.aws.amazon.com/ko_kr/cdk/latest/guide/tokens.html 79 | # String-encoded tokens: 80 | # Avoid manipulating the string in other ways. For example, 81 | # taking a substring of a string is likely to break the string token. 82 | firehose_log_group_name = f"/aws/kinesisfirehose/{source_kinesis_stream.stream_name}" 83 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 84 | effect=aws_iam.Effect.ALLOW, 85 | #XXX: The ARN will be formatted as follows: 86 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 87 | resources=[self.format_arn(service="logs", resource="log-group", 88 | resource_name="{}:log-stream:*".format(firehose_log_group_name), 89 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 90 | actions=["logs:PutLogEvents"] 91 | )) 92 | 93 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 94 | "effect": aws_iam.Effect.ALLOW, 95 | "resources": [f"{data_transform_lambda_fn.function_arn}:*"], 96 | "actions": [ 97 | "lambda:InvokeFunction", 98 | "lambda:GetFunctionConfiguration" 99 | ] 100 | })) 101 | 102 | self.firehose_role = aws_iam.Role(self, "KinesisFirehoseServiceRole", 103 | role_name=f"KinesisFirehoseServiceRole-{source_kinesis_stream.stream_name}-{self.region}", 104 | assumed_by=aws_iam.ServicePrincipal("firehose.amazonaws.com"), 105 | #XXX: use inline_policies to work around https://github.com/aws/aws-cdk/issues/5221 106 | inline_policies={ 107 | "firehose_role_policy": firehose_role_policy_doc 108 | } 109 | ) 110 | 111 | 112 | cdk.CfnOutput(self, 'FirehoseRole', 113 | value=self.firehose_role.role_name, 114 | export_name=f'{self.stack_name}-Role') 115 | cdk.CfnOutput(self, 'FirehoseRoleArn', 116 | value=self.firehose_role.role_arn, 117 | export_name=f'{self.stack_name}-RoleArn') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/firehose_to_iceberg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_s3 as s3, 10 | aws_kinesisfirehose 11 | ) 12 | from constructs import Construct 13 | 14 | from aws_cdk.aws_kinesisfirehose import CfnDeliveryStream as cfn_delivery_stream 15 | 16 | 17 | class FirehoseToIcebergStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, 20 | data_transform_lambda_fn, source_kinesis_stream, s3_bucket, 21 | firehose_role, **kwargs) -> None: 22 | 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 26 | 27 | firehose_log_group_name = f"/aws/kinesisfirehose/{source_kinesis_stream.stream_name}" 28 | 29 | firehose_buffering_hints = data_firehose_configuration["buffering_hints"] 30 | firehose_buffer_size = firehose_buffering_hints["size_in_mbs"] 31 | 
firehose_buffer_interval = firehose_buffering_hints["interval_in_seconds"] 32 | 33 | transform_records_with_aws_lambda = data_firehose_configuration["transform_records_with_aws_lambda"] 34 | firehose_lambda_buffer_size = transform_records_with_aws_lambda["buffer_size"] 35 | firehose_lambda_buffer_interval = transform_records_with_aws_lambda["buffer_interval"] 36 | firehose_lambda_number_of_retries = transform_records_with_aws_lambda["number_of_retries"] 37 | 38 | s3_output_prefix = data_firehose_configuration["output_prefix"] 39 | s3_error_output_prefix = data_firehose_configuration["error_output_prefix"] 40 | 41 | lambda_proc = cfn_delivery_stream.ProcessorProperty( 42 | type="Lambda", 43 | parameters=[ 44 | cfn_delivery_stream.ProcessorParameterProperty( 45 | parameter_name="LambdaArn", 46 | parameter_value='{}:{}'.format( 47 | data_transform_lambda_fn.function_arn, 48 | data_transform_lambda_fn.latest_version.version 49 | ) 50 | ), 51 | cfn_delivery_stream.ProcessorParameterProperty( 52 | parameter_name="NumberOfRetries", 53 | parameter_value=str(firehose_lambda_number_of_retries) 54 | ), 55 | cfn_delivery_stream.ProcessorParameterProperty( 56 | parameter_name="RoleArn", 57 | parameter_value=firehose_role.role_arn 58 | ), 59 | cfn_delivery_stream.ProcessorParameterProperty( 60 | parameter_name="BufferSizeInMBs", 61 | parameter_value=str(firehose_lambda_buffer_size) 62 | ), 63 | cfn_delivery_stream.ProcessorParameterProperty( 64 | parameter_name="BufferIntervalInSeconds", 65 | parameter_value=str(firehose_lambda_buffer_interval) 66 | ) 67 | ] 68 | ) 69 | 70 | firehose_processing_config = cfn_delivery_stream.ProcessingConfigurationProperty( 71 | enabled=True, 72 | processors=[ 73 | lambda_proc 74 | ] 75 | ) 76 | 77 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 78 | dest_iceberg_table_unique_keys = dest_iceberg_table_config.get("unique_keys", None) 79 | dest_iceberg_table_unique_keys = dest_iceberg_table_unique_keys if dest_iceberg_table_unique_keys else None 80 | 81 | iceberg_dest_config = cfn_delivery_stream.IcebergDestinationConfigurationProperty( 82 | catalog_configuration=cfn_delivery_stream.CatalogConfigurationProperty( 83 | catalog_arn=f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog" 84 | ), 85 | role_arn=firehose_role.role_arn, 86 | s3_configuration=cfn_delivery_stream.S3DestinationConfigurationProperty( 87 | bucket_arn=s3_bucket.bucket_arn, 88 | role_arn=firehose_role.role_arn, 89 | buffering_hints={ 90 | "intervalInSeconds": firehose_buffer_interval, 91 | "sizeInMBs": firehose_buffer_size 92 | }, 93 | cloud_watch_logging_options={ 94 | "enabled": True, 95 | "logGroupName": firehose_log_group_name, 96 | "logStreamName": "DestinationDelivery" 97 | }, 98 | compression_format="UNCOMPRESSED", # [GZIP | HADOOP_SNAPPY | Snappy | UNCOMPRESSED | ZIP] 99 | error_output_prefix=s3_error_output_prefix, 100 | prefix=s3_output_prefix, 101 | ), 102 | buffering_hints={ 103 | "intervalInSeconds": firehose_buffer_interval, 104 | "sizeInMBs": firehose_buffer_size 105 | }, 106 | cloud_watch_logging_options={ 107 | "enabled": True, 108 | "logGroupName": firehose_log_group_name, 109 | "logStreamName": "DestinationDelivery" 110 | }, 111 | destination_table_configuration_list=[ 112 | cfn_delivery_stream.DestinationTableConfigurationProperty( 113 | destination_database_name=dest_iceberg_table_config["database_name"], 114 | destination_table_name=dest_iceberg_table_config["table_name"], 115 | unique_keys=dest_iceberg_table_unique_keys 116 
| ) 117 | ], 118 | processing_configuration=firehose_processing_config, 119 | s3_backup_mode='FailedDataOnly' 120 | ) 121 | 122 | _ = aws_kinesisfirehose.CfnDeliveryStream(self, "FirehoseToIceberg", 123 | delivery_stream_name=source_kinesis_stream.stream_name, 124 | delivery_stream_type="KinesisStreamAsSource", 125 | kinesis_stream_source_configuration={ 126 | "kinesisStreamArn": source_kinesis_stream.stream_arn, 127 | "roleArn": firehose_role.role_arn 128 | }, 129 | iceberg_destination_configuration=iceberg_dest_config, 130 | tags=[{"key": "Name", "value": source_kinesis_stream.stream_name}] 131 | ) 132 | 133 | 134 | cdk.CfnOutput(self, 'SourceKinesisStreamName', 135 | value=source_kinesis_stream.stream_name, 136 | export_name=f'{self.stack_name}-SourceKinesisStreamName') 137 | cdk.CfnOutput(self, 'S3DestBucket', 138 | value=s3_bucket.bucket_name, 139 | export_name=f'{self.stack_name}-S3DestBucket') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/kds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Duration, 12 | Stack, 13 | aws_kinesis, 14 | ) 15 | from constructs import Construct 16 | 17 | random.seed(31) 18 | 19 | 20 | class KdsStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | KINESIS_DEFAULT_STREAM_NAME = 'PUT-Firehose-{}'.format(''.join(random.sample((string.ascii_letters), k=5))) 26 | KINESIS_STREAM_NAME = self.node.try_get_context('kinesis_stream_name') or KINESIS_DEFAULT_STREAM_NAME 27 | 28 | self.kinesis_stream = aws_kinesis.Stream(self, "SourceKinesisStreams", 29 | retention_period=Duration.hours(24), 30 | stream_mode=aws_kinesis.StreamMode.ON_DEMAND, 31 | stream_name=KINESIS_STREAM_NAME) 32 | 33 | cdk.CfnOutput(self, 'KinesisDataStreamName', 34 | value=self.kinesis_stream.stream_name, 35 | export_name=f'{self.stack_name}-KinesisDataStreamName') 36 | -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/lake_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, firehose_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 20 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 21 | database_name=dest_iceberg_table_config["database_name"] 22 | 23 | #XXXX: The role assumed by cdk is not a data lake administrator. 24 | # So, deploying PrincipalPermissions meets the error such as: 25 | # "Resource does not exist or requester is not authorized to access requested permissions." 26 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
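# This is what the CfnDataLakeSettings resource below does: it registers the CDK CloudFormation
# execution role (resolved from self.synthesizer.cloud_formation_execution_role_arn) as a
# Lake Formation data lake administrator, and CfnPrincipalPermissions then grants the Firehose
# role access to the destination database.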
27 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 28 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 29 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 30 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 31 | )] 32 | ) 33 | 34 | cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions", 35 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 36 | permissions_with_grant_option=[], 37 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 38 | data_lake_principal_identifier=firehose_role.role_arn 39 | ), 40 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 41 | #XXX: Can't specify a TableWithColumns resource and a Table resource 42 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 43 | catalog_id=cdk.Aws.ACCOUNT_ID, 44 | database_name=database_name, 45 | # name="ALL_TABLES", 46 | table_wildcard={} 47 | ) 48 | ) 49 | ) 50 | cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 51 | 52 | #XXX: In order to keep resource destruction order, 53 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 54 | cfn_principal_permissions.add_dependency(cfn_data_lake_settings) 55 | 56 | 57 | cdk.CfnOutput(self, 'Principal', 58 | value=cfn_principal_permissions.attr_principal_identifier, 59 | export_name=f'{self.stack_name}-Principal') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_s3 as s3 10 | ) 11 | 12 | from constructs import Construct 13 | 14 | 15 | class S3BucketStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | S3_DEFAULT_BUCKET_NAME = f"web-analytics-{self.region}-{self.account}" 21 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 22 | s3_bucket_name = data_firehose_configuration.get('s3_bucket_name', S3_DEFAULT_BUCKET_NAME) 23 | 24 | self.s3_bucket = s3.Bucket(self, "s3bucket", 25 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 26 | bucket_name=s3_bucket_name) 27 | 28 | 29 | cdk.CfnOutput(self, 'S3BucketName', 30 | value=self.s3_bucket.bucket_name, 31 | export_name=f'{self.stack_name}-S3BucketName') -------------------------------------------------------------------------------- /web-analytics-iceberg/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.24.41 2 | mimesis==18.0.0 3 | requests>=2.31.0 4 | 5 | # packages for Lambda Layer 6 | fastavro==1.10.0 7 | -------------------------------------------------------------------------------- /web-analytics-iceberg/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.185.0 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /web-analytics-iceberg/source.bat: 
-------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 |
-------------------------------------------------------------------------------- /web-analytics-iceberg/src/main/python/IcebergTransformer/firehose_to_iceberg_transformer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import base64 6 | import collections 7 | import json 8 | import logging 9 | import os 10 | from datetime import datetime 11 | 12 | import fastavro 13 | 14 | 15 | LOGGER = logging.getLogger() 16 | if len(LOGGER.handlers) > 0: 17 | # The Lambda environment pre-configures a handler logging to stderr. 18 | # If a handler is already configured, `.basicConfig` does not execute. 19 | # Thus we set the level directly. 20 | LOGGER.setLevel(logging.INFO) 21 | else: 22 | logging.basicConfig(level=logging.INFO) 23 | 24 |
25 | DESTINATION_DATABASE_NAME = os.environ['IcebergDatabaseName'] 26 | DESTINATION_TABLE_NAME = os.environ['IcebergTableName'] 27 | DESTINATION_TABLE_UNIQUE_KEYS = os.environ.get('IcebergTableUniqueKeys', None) 28 | 29 | ORIGINAL_SCHEMA = { 30 | 'name': 'WebLogs', 31 | 'type': 'record', 32 | 'fields': [ 33 | { 34 | 'name': 'user_id', 35 | 'type': 'string' 36 | }, 37 | { 38 | 'name': 'session_id', 39 | 'type': 'string' 40 | }, 41 | { 42 | 'name': 'event', 43 | 'type': 'string' 44 | }, 45 | { 46 | 'name': 'referrer', 47 | 'type': ['string', 'null'] 48 | }, 49 | { 50 | 'name': 'user_agent', 51 | 'type': ['string', 'null'] 52 | }, 53 | { 54 | 'name': 'ip', 55 | 'type': 'string' 56 | }, 57 | { 58 | 'name': 'hostname', 59 | 'type': 'string' 60 | }, 61 | { 62 | 'name': 'os', 63 | 'type': ['string', 'null'] 64 | }, 65 | { 66 | 'name': 'timestamp', 67 | 'type': { 68 | 'type': 'string', 69 | 'logicalType': 'datetime' 70 | } 71 | }, 72 | { 73 | 'name': 'uri', 74 | 'type': 'string' 75 | } 76 | ] 77 | } 78 | 79 |
80 | def read_datetime(data, writer_schema=None, reader_schema=None): 81 | return datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 82 | 83 | 84 | def prepare_datetime(data, schema): 85 | """Converts datetime.datetime to a string representing the date and time""" 86 | if isinstance(data, datetime): 87 | return data.strftime('%Y-%m-%dT%H:%M:%SZ') 88 | else: 89 | try: 90 | dt = datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 91 | return dt.strftime('%Y-%m-%dT%H:%M:%SZ') 92 | except Exception as ex: 93 | return None 94 | 95 |
96 | fastavro.read.LOGICAL_READERS["string-datetime"] = read_datetime 97 | fastavro.write.LOGICAL_WRITERS["string-datetime"] = prepare_datetime 98 | 99 | PARSED_SCHEMA = fastavro.parse_schema(ORIGINAL_SCHEMA) 100 | 101 | 102 | def check_schema(record): 103 | try: 104 | return fastavro.validation.validate(record, PARSED_SCHEMA, raise_errors=False) 105 | except Exception as ex: 106 | LOGGER.error(ex) 107 | return False 108 | 109 | 110 | def lambda_handler(event, context): 111 | counter = collections.Counter(total=0, valid=0, invalid=0) 112 |
firehose_records_output = {'records': []} 113 | 114 | unique_keys_exist = True if DESTINATION_TABLE_UNIQUE_KEYS else False 115 | otf_metadata_operation = 'insert' if not unique_keys_exist else 'update' 116 | 117 | for record in event['records']: 118 | counter['total'] += 1 119 | 120 | payload = base64.b64decode(record['data']).decode('utf-8') 121 | json_value = json.loads(payload) 122 | 123 | #XXX: check if schema is valid 124 | is_valid = check_schema(json_value) 125 | counter['valid' if is_valid else 'invalid'] += 1 126 | 127 | firehose_record = { 128 | 'data': base64.b64encode(payload.encode('utf-8')), 129 | 'recordId': record['recordId'], 130 | 'result': 'Ok' if is_valid else 'ProcessingFailed', # [Ok, Dropped, ProcessingFailed] 131 | 'metadata': { 132 | 'otfMetadata': { 133 | 'destinationDatabaseName': DESTINATION_DATABASE_NAME, 134 | 'destinationTableName': DESTINATION_TABLE_NAME, 135 | 'operation': otf_metadata_operation 136 | } 137 | } 138 | } 139 | 140 | firehose_records_output['records'].append(firehose_record) 141 | 142 | LOGGER.info(', '.join("{}={}".format(k, v) for k, v in counter.items())) 143 | 144 | return firehose_records_output 145 | 146 | 147 | if __name__ == '__main__': 148 | import pprint 149 | 150 | record_list = [ 151 | ('Ok', { 152 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 153 | "session_id": "a5aa20a72c9e37588f9bbeaa", 154 | "event": "view", 155 | "referrer": "brandon.biz", 156 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 157 | "ip": "202.165.71.49", 158 | "hostname": "toxic.tokyo", 159 | "os": "openSUSE", 160 | "timestamp": "2022-09-16T07:35:46Z", 161 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 162 | }), 163 | ('Ok', { 164 | "user_id": "70b1f606-aa63-47fb-bc92-76de9c59d064", 165 | "session_id": "928e78473db8449b17644b2c", 166 | "event": "like", 167 | # missing optional data 168 | # "referrer": "toe.gq", 169 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.53", 170 | "ip": "12.166.113.176", 171 | "hostname": "drivers.glass", 172 | "os": "Windows 8.1", 173 | "timestamp": "2022-09-16T07:52:47Z", 174 | "uri": "https://aaa.gov/2022/04/29/cialis-prayer-presentations-completed-avenue-vision?trucks=cut&indeed=members" 175 | }), 176 | ('ProcessingFailed', { 177 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 178 | "session_id": "a5aa20a72c9e37588f9bbeaa", 179 | "event": "cart", 180 | "referrer": "brandon.biz", 181 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 182 | "ip": "202.165.71.49", 183 | "hostname": "toxic.tokyo", 184 | "os": "openSUSE", 185 | # invalid datetime format 186 | "timestamp": "2022-09-16 07:35:46", 187 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 188 | }), 189 | ('ProcessingFailed', { 190 | # missing required data 191 | # "user_id": "045e63c7-b276-4117-9706-7c2e3b87d5f5", 192 | "session_id": "abfd47eb7dd7b8aeec0555a7", 193 | "event": "purchase", 194 | "referrer": "transfer.edu", 195 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 9.50", 196 | "ip": "170.128.148.234", 197 | "hostname": "propecia.tc", 198 | "os": "Lubuntu", 199 | "timestamp": "2022-09-16T07:46:04Z", 200 | "uri": 
"https://pee.cloud/2019/06/15/alan-publish-perl-snow-notification-gap-improvement-guaranteed-changed-determining?casino=admissions&cottage=hotel" 201 | }), 202 | ('ProcessingFailed', { 203 | "user_id": "e504cd9d-30da-497f-8f28-2b3f64220e16", 204 | "session_id": "fd4807ab825ee8bd950b1e8b", 205 | "event": "list", 206 | "referrer": "liquid.aquitaine", 207 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.02", 208 | # mismatched data type 209 | "ip": 212234672, 210 | "hostname": "consequently.com", 211 | "os": "Gentoo", 212 | "timestamp": "2022-09-16T07:13:29Z", 213 | "uri": "https://railway.sz/2014/10/30/use-phone-task-marketplace?pot=it&album=cook" 214 | }), 215 | ('ProcessingFailed', { 216 | # mismatched column name 217 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 218 | # mismatched column name 219 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 220 | "event": "visit", 221 | "referrer": "brandon.biz", 222 | # mismatched column name 223 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 224 | "ip": "202.165.71.49", 225 | "hostname": "toxic.tokyo", 226 | "os": "openSUSE", 227 | "timestamp": "2022-09-16T07:35:46Z", 228 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 229 | }) 230 | ] 231 | 232 | for correct_result, record in record_list: 233 | event = { 234 | "invocationId": "invocationIdExample", 235 | "deliveryStreamArn": "arn:aws:kinesis:EXAMPLE", 236 | "region": "us-east-1", 237 | "records": [ 238 | { 239 | "recordId": "49546986683135544286507457936321625675700192471156785154", 240 | "approximateArrivalTimestamp": 1495072949453, 241 | "data": base64.b64encode(json.dumps(record).encode('utf-8')) 242 | } 243 | ] 244 | } 245 | 246 | res = lambda_handler(event, {}) 247 | print(f"\n>> {correct_result} == {res['records'][0]['result']}?", res['records'][0]['result'] == correct_result) 248 | pprint.pprint(res) 249 | -------------------------------------------------------------------------------- /web-analytics-iceberg/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import ( 8 | datetime, 9 | timezone 10 | ) 11 | import json 12 | import time 13 | import typing 14 | 15 | from mimesis.locales import Locale 16 | from mimesis.schema import Field, Schema 17 | from mimesis.providers.base import BaseProvider 18 | import requests 19 | 20 | 21 | class CustomDatetime(BaseProvider): 22 | class Meta: 23 | """Class for metadata.""" 24 | name: typing.Final[str] = "custom_datetime" 25 | 26 | def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: 27 | super().__init__(*args, **kwargs) 28 | 29 | def timestamp(self) -> str: 30 | utc_now = datetime.now(timezone.utc) 31 | minute = self.random.randint(0, 59) 32 | second = self.random.randint(0, 59) 33 | random_datetime = utc_now.replace(minute=minute, second=second) 34 | return random_datetime.strftime("%Y-%m-%dT%H:%M:%SZ") 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | 40 | parser.add_argument('--api-url', help='log collector api url') 41 | parser.add_argument('--api-method', default='records', choices=['record', 'records'], 42 | help='log collector api method [record | records]') 43 | parser.add_argument('--stream-name', help='kinesis stream 
name') 44 | parser.add_argument('--max-count', default=15, type=int, help='max number of records to put') 45 | parser.add_argument('--dry-run', action='store_true') 46 | 47 | options = parser.parse_args() 48 | 49 | _field = Field(locale=Locale.EN) 50 | _field._generic.add_provider(CustomDatetime) 51 | 52 | schema_definition = lambda: { 53 | "user_id": _field("uuid"), 54 | "session_id": _field("token_hex", entropy=12), 55 | "event": _field("choice", items=['visit', 'view', 'list', 'like', 'cart', 'purchase']), 56 | "referrer": _field("internet.hostname"), 57 | "user_agent": _field("internet.user_agent"), 58 | "ip": _field("internet.ip_v4"), 59 | "hostname": _field("internet.hostname"), 60 | "os": _field("development.os"), 61 | "timestamp": _field("custom_datetime.timestamp"), 62 | "uri": _field("internet.uri", query_params_count=2) 63 | } 64 | schema = Schema(schema=schema_definition, iterations=options.max_count) 65 | 66 | log_collector_url = f'{options.api_url}/streams/{options.stream_name}/{options.api_method}' if not options.dry_run else None 67 | 68 | for record in schema: 69 | if options.dry_run: 70 | print(json.dumps(record), file=sys.stderr) 71 | continue 72 | 73 | partition_key = record['user_id'] 74 | if options.api_method == 'record': 75 | data = {'Data': record, 'PartitionKey': partition_key} 76 | payload = f'{json.dumps(data)}' 77 | else: 78 | #XXX: make sure data has newline 79 | data = {"records":[{'data': f'{json.dumps(record)}\n', 'partition-key': partition_key}]} 80 | payload = json.dumps(data) 81 | 82 | res = requests.put(log_collector_url, data=payload, headers={'Content-Type': 'application/json'}) 83 | if res.status_code == 200: 84 | print(f'[{res.status_code} {res.reason}]', res.text, file=sys.stderr) 85 | else: 86 | print(f'[{res.status_code} {res.reason}]', file=sys.stderr) 87 | sys.exit(1) 88 | time.sleep(0.5) 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /web-analytics-iceberg/src/utils/kds_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import argparse 6 | import pprint 7 | import random 8 | import time 9 | 10 | import boto3 11 | 12 | random.seed(47) 13 | 14 | SHARD_ITER_TYPE = ('TRIM_HORIZON', 'LATEST') 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--stream-name', action="store", help='kinesis stream name') 20 | parser.add_argument('--shard-id', action="store", help='kinesis stream shard-id') 21 | parser.add_argument('--iter-type', choices=SHARD_ITER_TYPE, default='LATEST', 22 | help='kinesis stream shard iterator type: [{}]'.format(', '.join(SHARD_ITER_TYPE))) 23 | parser.add_argument('--region-name', action='store', default='us-east-1', 24 | help='aws region name (default: us-east-1)') 25 | 26 | options = parser.parse_args() 27 | 28 | stream_name, shard_iter_type = options.stream_name, options.iter_type 29 | 30 | kinesis_client = boto3.client('kinesis', region_name=options.region_name) 31 | response = kinesis_client.describe_stream(StreamName=stream_name) 32 | if options.shard_id: 33 | shard_id = options.shard_id 34 | else: 35 | shard_id_list = [e['ShardId'] for e in response['StreamDescription']['Shards']] 36 | shard_id = random.choice(shard_id_list) 37 | 38 | shard_iterator = kinesis_client.get_shard_iterator(StreamName=stream_name, 39 | ShardId=shard_id, 40 | 
ShardIteratorType=shard_iter_type) 41 | 42 | shard_iter = shard_iterator['ShardIterator'] 43 | record_response = kinesis_client.get_records(ShardIterator=shard_iter, Limit=123) 44 | pprint.pprint(record_response.get('Records', []), indent=2) 45 | 46 | while 'NextShardIterator' in record_response: 47 | record_response = kinesis_client.get_records(ShardIterator=record_response['NextShardIterator'], Limit=123) 48 | pprint.pprint(record_response.get('Records', []), indent=2) 49 | 50 | # wait for a few seconds 51 | time.sleep(5) 52 | 53 | if __name__ == '__main__': 54 | main() 55 | 56 | -------------------------------------------------------------------------------- /web-analytics-parquet/.example.cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "acknowledged-issue-numbers": [ 3 | 32775 4 | ], 5 | "firehose_data_tranform_lambda": { 6 | "s3_bucket_name": "s3-bucket-name-for-lambda-layer-resources", 7 | "s3_object_key": "var/fastavro-lib.zip" 8 | }, 9 | "firehose": { 10 | "buffer_size_in_mbs": 128, 11 | "buffer_interval_in_seconds": 300, 12 | "lambda_buffer_size_in_mbs": 3, 13 | "lambda_buffer_interval_in_seconds": 300, 14 | "lambda_number_of_retries": 3, 15 | "s3_output_folder": "json-data", 16 | "prefix": "json-data/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/", 17 | "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}" 18 | }, 19 | "merge_small_files_lambda_env": { 20 | "OLD_DATABASE": "mydatabase", 21 | "OLD_TABLE_NAME": "web_log_json", 22 | "NEW_DATABASE": "mydatabase", 23 | "NEW_TABLE_NAME": "web_log_parquet", 24 | "NEW_TABLE_S3_FOLDER_NAME": "parquet-data", 25 | "COLUMN_NAMES": "userId,sessionId,referrer,userAgent,ip,hostname,os,timestamp,uri" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /web-analytics-parquet/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /web-analytics-parquet/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /web-analytics-parquet/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 
8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /web-analytics-parquet/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /web-analytics-parquet/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from cdk_stacks import ( 10 | KdsProxyApiGwStack, 11 | KdsStack, 12 | FirehoseDataTransformLambdaStack, 13 | FirehoseStack, 14 | MergeSmallFilesLambdaStack, 15 | AthenaWorkGroupStack, 16 | AthenaNamedQueryStack, 17 | GlueCatalogDatabaseStack, 18 | DataLakePermissionsStack 19 | ) 20 | 21 | AWS_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 22 | region=os.getenv('CDK_DEFAULT_REGION')) 23 | 24 | app = cdk.App() 25 | 26 | kds_proxy_apigw = KdsProxyApiGwStack(app, 'WebAnalyticsKdsProxyApiGw') 27 | kds_stack = KdsStack(app, 'WebAnalyticsKinesisStream') 28 | 29 | firehose_data_transform_lambda = FirehoseDataTransformLambdaStack(app, 30 | 'WebAnalyticsFirehoseDataTransformLambda') 31 | firehose_data_transform_lambda.add_dependency(kds_stack) 32 | 33 | firehose_stack = FirehoseStack(app, 'WebAnalyticsFirehose', 34 | kds_stack.target_kinesis_stream.stream_arn, 35 | firehose_data_transform_lambda.schema_validator_lambda_fn) 36 | firehose_stack.add_dependency(firehose_data_transform_lambda) 37 | 38 | athena_work_group_stack = AthenaWorkGroupStack(app, 39 | 'WebAnalyticsAthenaWorkGroup' 40 | ) 41 | athena_work_group_stack.add_dependency(firehose_stack) 42 | 43 | merge_small_files_stack = MergeSmallFilesLambdaStack(app, 44 | 'WebAnalyticsMergeSmallFiles', 45 | firehose_stack.s3_dest_bucket_name, 46 | firehose_stack.s3_dest_folder_name, 47 | athena_work_group_stack.athena_work_group_name 48 | ) 49 | merge_small_files_stack.add_dependency(athena_work_group_stack) 50 | 51 | athena_databases = GlueCatalogDatabaseStack(app, 'WebAnalyticsGlueDatabases') 52 | athena_databases.add_dependency(merge_small_files_stack) 53 | 54 | lakeformation_grant_permissions = DataLakePermissionsStack(app, 'WebAnalyticsGrantLFPermissionsOnMergeFilesJob', 55 | merge_small_files_stack.lambda_exec_role 56 | ) 57 | lakeformation_grant_permissions.add_dependency(athena_databases) 58 | 59 | athena_named_query_stack = AthenaNamedQueryStack(app, 60 | 'WebAnalyticsAthenaNamedQueries', 61 | athena_work_group_stack.athena_work_group_name, 62 | merge_small_files_stack.s3_json_location, 63 | merge_small_files_stack.s3_parquet_location 64 | ) 65 | athena_named_query_stack.add_dependency(lakeformation_grant_permissions) 66 | 67 | app.synth() 68 | 
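# A possible end-to-end deployment flow for the stacks defined above (a sketch, assuming the
# CDK CLI is installed, AWS credentials are configured, the fastavro Lambda layer zip has been
# uploaded with build-aws-lambda-layer-package.sh, and cdk.context.json is filled in along the
# lines of .example.cdk.context.json):
#
#   (.venv) $ cdk bootstrap
#   (.venv) $ cdk deploy --require-approval never --all
#
# Because of the add_dependency() calls above, `cdk deploy --all` provisions the stacks in
# dependency order; individual stacks can also be deployed by name, e.g.
# `cdk deploy WebAnalyticsKinesisStream`.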
-------------------------------------------------------------------------------- /web-analytics-parquet/assets/amazon-athena-switching-to-workgroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-parquet/assets/amazon-athena-switching-to-workgroup.png -------------------------------------------------------------------------------- /web-analytics-parquet/assets/data-lake-formation-permissions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-parquet/assets/data-lake-formation-permissions.png -------------------------------------------------------------------------------- /web-analytics-parquet/build-aws-lambda-layer-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash - 2 | 3 | LAMBDA_LAYER_NAME=fastavro-lib 4 | S3_PATH=$1 5 | 6 | docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.9" /bin/sh -c "pip install fastavro==1.6.1 -t python/lib/python3.9/site-packages/; exit" 7 | 8 | zip -q -r ${LAMBDA_LAYER_NAME}.zip python >/dev/null 9 | aws s3 cp --quiet ${LAMBDA_LAYER_NAME}.zip s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip 10 | echo "[Lambda_Layer_Code_S3_Path] s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip" 11 | 12 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .apigw import KdsProxyApiGwStack 2 | from .athena_workgroup import AthenaWorkGroupStack 3 | from .athena_named_query import AthenaNamedQueryStack 4 | from .firehose_dtata_transform_lambda import FirehoseDataTransformLambdaStack 5 | from 
.firehose import FirehoseStack 6 | from .kds import KdsStack 7 | from .merge_small_files_lambda import MergeSmallFilesLambdaStack 8 | from .vpc import VpcStack 9 | from .glue_catalog_database import GlueCatalogDatabaseStack 10 | from .lake_formation import DataLakePermissionsStack 11 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/apigw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import json 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_apigateway, 12 | aws_iam, 13 | ) 14 | from constructs import Construct 15 | 16 | 17 | class KdsProxyApiGwStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | apigw_kds_access_role_policy_doc = aws_iam.PolicyDocument() 23 | apigw_kds_access_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": ["*"], 26 | "actions": [ 27 | "kinesis:DescribeStream", 28 | "kinesis:PutRecord", 29 | "kinesis:PutRecords"] 30 | })) 31 | 32 | apigw_kds_role = aws_iam.Role(self, "APIGatewayRoleToAccessKinesisDataStreams", 33 | role_name='APIGatewayRoleToAccessKinesisDataStreams', 34 | assumed_by=aws_iam.ServicePrincipal('apigateway.amazonaws.com'), 35 | inline_policies={ 36 | 'KinesisWriteAccess': apigw_kds_access_role_policy_doc 37 | }, 38 | managed_policies=[ 39 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisReadOnlyAccess') 40 | ] 41 | ) 42 | 43 | #XXX: Start to create an API as a Kinesis proxy 44 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-api-as-kinesis-proxy 45 | kds_proxy_api = aws_apigateway.RestApi(self, "KdsProxyAPI", 46 | rest_api_name="log-collector", 47 | description="An Amazon API Gateway REST API that integrated with an Amazon Kinesis Data Streams.", 48 | endpoint_types=[aws_apigateway.EndpointType.REGIONAL], 49 | default_cors_preflight_options={ 50 | "allow_origins": aws_apigateway.Cors.ALL_ORIGINS 51 | }, 52 | deploy=True, 53 | deploy_options=aws_apigateway.StageOptions(stage_name="v1"), 54 | endpoint_export_name="KdsProxyAPIEndpoint" 55 | ) 56 | 57 | apigw_error_responses = [ 58 | aws_apigateway.IntegrationResponse(status_code="400", selection_pattern="4\\d{2}"), 59 | aws_apigateway.IntegrationResponse(status_code="500", selection_pattern="5\\d{2}") 60 | ] 61 | 62 | #XXX: GET /streams 63 | # List Kinesis streams by using the API Gateway console 64 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-list-kinesis-streams 65 | 66 | streams_resource = kds_proxy_api.root.add_resource("streams") 67 | 68 | list_streams_options = aws_apigateway.IntegrationOptions( 69 | credentials_role=apigw_kds_role, 70 | integration_responses=[ 71 | aws_apigateway.IntegrationResponse( 72 | status_code="200" 73 | ), 74 | *apigw_error_responses 75 | ], 76 | request_templates={ 77 | 'application/json': '{}' 78 | }, 79 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 80 | ) 81 | 82 | list_streams_integration = aws_apigateway.AwsIntegration( 83 | service='kinesis', 84 | action='ListStreams', 85 | integration_http_method='POST', 86 | options=list_streams_options 
87 | ) 88 | 89 | streams_resource.add_method("GET", list_streams_integration, 90 | # Default `authorization_type`: - open access unless `authorizer` is specified 91 | authorization_type=aws_apigateway.AuthorizationType.NONE, 92 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 93 | response_models={ 94 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 95 | } 96 | ), 97 | aws_apigateway.MethodResponse(status_code='400'), 98 | aws_apigateway.MethodResponse(status_code='500') 99 | ]) 100 | 101 | #XXX: GET /streams/{stream-name} 102 | # Describe a stream in Kinesis 103 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-describe-delete-stream 104 | one_stream_resource = streams_resource.add_resource("{stream-name}") 105 | 106 | describe_stream_options = aws_apigateway.IntegrationOptions( 107 | credentials_role=apigw_kds_role, 108 | integration_responses=[ 109 | aws_apigateway.IntegrationResponse( 110 | status_code="200" 111 | ), 112 | *apigw_error_responses 113 | ], 114 | request_templates={ 115 | 'application/json': json.dumps({ 116 | "StreamName": "$input.params('stream-name')" 117 | }, indent=2) 118 | }, 119 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 120 | ) 121 | 122 | describe_stream_integration = aws_apigateway.AwsIntegration( 123 | service='kinesis', 124 | action='DescribeStream', 125 | integration_http_method='POST', 126 | options=describe_stream_options 127 | ) 128 | 129 | one_stream_resource.add_method("GET", describe_stream_integration, 130 | # Default `authorization_type`: - open access unless `authorizer` is specified 131 | authorization_type=aws_apigateway.AuthorizationType.NONE, 132 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 133 | response_models={ 134 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 135 | } 136 | ), 137 | aws_apigateway.MethodResponse(status_code='400'), 138 | aws_apigateway.MethodResponse(status_code='500') 139 | ]) 140 | 141 | #XXX: PUT /streams/{stream-name}/record 142 | # Put a record into a stream in Kinesis 143 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 144 | record_resource = one_stream_resource.add_resource("record") 145 | 146 | put_record_request_mapping_templates = ''' 147 | { 148 | "StreamName": "$input.params('stream-name')", 149 | "Data": "$util.base64Encode($input.json('$.Data'))", 150 | "PartitionKey": "$input.path('$.PartitionKey')" 151 | } 152 | ''' 153 | 154 | put_record_options = aws_apigateway.IntegrationOptions( 155 | credentials_role=apigw_kds_role, 156 | integration_responses=[ 157 | aws_apigateway.IntegrationResponse( 158 | status_code="200" 159 | ), 160 | *apigw_error_responses 161 | ], 162 | request_templates={ 163 | 'application/json': put_record_request_mapping_templates 164 | }, 165 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 166 | ) 167 | 168 | put_record_integration = aws_apigateway.AwsIntegration( 169 | service='kinesis', 170 | action='PutRecord', 171 | integration_http_method='POST', 172 | options=put_record_options 173 | ) 174 | 175 | record_resource.add_method("PUT", put_record_integration, 176 | # Default `authorization_type`: - open access unless `authorizer` is specified 177 | authorization_type=aws_apigateway.AuthorizationType.NONE, 178 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 179 | 
response_models={ 180 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 181 | } 182 | ), 183 | aws_apigateway.MethodResponse(status_code='400'), 184 | aws_apigateway.MethodResponse(status_code='500') 185 | ]) 186 | 187 | 188 | #XXX: PUT /streams/{stream-name}/records 189 | # Put records into a stream in Kinesis 190 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 191 | records_resource = one_stream_resource.add_resource("records") 192 | 193 | put_records_request_mapping_templates = ''' 194 | { 195 | "StreamName": "$input.params('stream-name')", 196 | "Records": [ 197 | #foreach($elem in $input.path('$.records')) 198 | { 199 | "Data": "$util.base64Encode($elem.data)", 200 | "PartitionKey": "$elem.partition-key" 201 | }#if($foreach.hasNext),#end 202 | #end 203 | ] 204 | } 205 | ''' 206 | 207 | put_records_options = aws_apigateway.IntegrationOptions( 208 | credentials_role=apigw_kds_role, 209 | integration_responses=[ 210 | aws_apigateway.IntegrationResponse( 211 | status_code="200" 212 | ), 213 | *apigw_error_responses 214 | ], 215 | request_templates={ 216 | 'application/json': put_records_request_mapping_templates 217 | }, 218 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 219 | ) 220 | 221 | put_records_integration = aws_apigateway.AwsIntegration( 222 | service='kinesis', 223 | action='PutRecords', 224 | integration_http_method='POST', 225 | options=put_records_options 226 | ) 227 | 228 | records_resource.add_method("PUT", put_records_integration, 229 | # Default `authorization_type`: - open access unless `authorizer` is specified 230 | authorization_type=aws_apigateway.AuthorizationType.NONE, 231 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 232 | response_models={ 233 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 234 | } 235 | ), 236 | aws_apigateway.MethodResponse(status_code='400'), 237 | aws_apigateway.MethodResponse(status_code='500') 238 | ]) 239 | 240 | cdk.CfnOutput(self, 'KdsRestApiName', 241 | value=kds_proxy_api.rest_api_name, 242 | export_name=f'{self.stack_name}-KdsProxyRestApiName') 243 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/athena_named_query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_athena 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class AthenaNamedQueryStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, athena_work_group_name, s3_json_location, s3_parquet_location, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | query_for_json_table = '''/* Create your database */ 20 | CREATE DATABASE IF NOT EXISTS mydatabase; 21 | 22 | /* Create table with partitions */ 23 | CREATE EXTERNAL TABLE `mydatabase.web_log_json`( 24 | `userId` string, 25 | `sessionId` string, 26 | `referrer` string, 27 | `userAgent` string, 28 | `ip` string, 29 | `hostname` string, 30 | `os` string, 31 | `timestamp` timestamp, 32 | `uri` string) 33 | PARTITIONED BY ( 34 | `year` int, 35 | `month` int, 36 | `day` int, 37 | `hour` int) 38 | ROW FORMAT SERDE 39 | 'org.openx.data.jsonserde.JsonSerDe' 40 | STORED AS INPUTFORMAT 41 | 
'org.apache.hadoop.mapred.TextInputFormat' 42 | OUTPUTFORMAT 43 | 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' 44 | LOCATION 45 | '{s3_location}'; 46 | 47 | /* Next we will load the partitions for this table */ 48 | MSCK REPAIR TABLE mydatabase.web_log_json; 49 | 50 | /* Check the partitions */ 51 | SHOW PARTITIONS mydatabase.web_log_json; 52 | 53 | SELECT COUNT(*) FROM mydatabase.web_log_json; 54 | '''.format(s3_location=s3_json_location) 55 | 56 | named_query_for_json_table = aws_athena.CfnNamedQuery(self, "MyAthenaCfnNamedQuery1", 57 | database="default", 58 | query_string=query_for_json_table, 59 | 60 | # the properties below are optional 61 | description="Sample Hive DDL statement to create a partitioned table pointing to web log data (json)", 62 | name="Create Web Log table (json) with partitions", 63 | work_group=athena_work_group_name 64 | ) 65 | 66 | query_for_parquet_table = '''/* Create your database */ 67 | CREATE DATABASE IF NOT EXISTS mydatabase; 68 | 69 | /* Create table with partitions */ 70 | CREATE EXTERNAL TABLE `mydatabase.web_log_parquet`( 71 | `userId` string, 72 | `sessionId` string, 73 | `referrer` string, 74 | `userAgent` string, 75 | `ip` string, 76 | `hostname` string, 77 | `os` string, 78 | `timestamp` timestamp, 79 | `uri` string) 80 | PARTITIONED BY ( 81 | `year` int, 82 | `month` int, 83 | `day` int, 84 | `hour` int) 85 | ROW FORMAT SERDE 86 | 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 87 | STORED AS INPUTFORMAT 88 | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 89 | OUTPUTFORMAT 90 | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 91 | LOCATION 92 | '{s3_location}'; 93 | 94 | /* Next we will load the partitions for this table */ 95 | MSCK REPAIR TABLE mydatabase.web_log_parquet; 96 | 97 | /* Check the partitions */ 98 | SHOW PARTITIONS mydatabase.web_log_parquet; 99 | 100 | SELECT COUNT(*) FROM mydatabase.web_log_parquet; 101 | '''.format(s3_location=s3_parquet_location) 102 | 103 | named_query_for_parquet_table = aws_athena.CfnNamedQuery(self, "MyAthenaCfnNamedQuery2", 104 | database="default", 105 | query_string=query_for_parquet_table, 106 | 107 | # the properties below are optional 108 | description="Sample Hive DDL statement to create a partitioned table pointing to web log data (parquet)", 109 | name="Create Web Log table (parquet) with partitions", 110 | work_group=athena_work_group_name 111 | ) 112 | 113 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/athena_workgroup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_athena, 10 | aws_s3 as s3, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class AthenaWorkGroupStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | ATHENA_WORK_GROUP_NAME = self.node.try_get_context('athena_workgroup_name') or 'WebAnalyticsGroup' 21 | 22 | S3_DEFAULT_BUCKET_NAME = 'aws-athena-query-results-{region}-{account_id}'.format( 23 | region=cdk.Aws.REGION, account_id=cdk.Aws.ACCOUNT_ID) 24 | s3_bucket = s3.Bucket(self, "s3bucket", 25 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: core.RemovalPolicy.RETAIN - The bucket will be orphaned 26 | 
bucket_name=S3_DEFAULT_BUCKET_NAME) 27 | 28 | athena_cfn_work_group = aws_athena.CfnWorkGroup(self, 'AthenaCfnWorkGroup', 29 | name=ATHENA_WORK_GROUP_NAME, 30 | 31 | # the properties below are optional 32 | description='workgroup for developer', 33 | recursive_delete_option=False, 34 | state='ENABLED', # [DISABLED, ENABLED] 35 | tags=[cdk.CfnTag( 36 | key='Name', 37 | value=ATHENA_WORK_GROUP_NAME 38 | )], 39 | work_group_configuration=aws_athena.CfnWorkGroup.WorkGroupConfigurationProperty( 40 | #XXX: EnforceWorkGroupConfiguration 41 | # Link: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-athena-workgroup-workgroupconfiguration.html#cfn-athena-workgroup-workgroupconfiguration-enforceworkgroupconfiguration 42 | # If set to "true", the settings for the workgroup override client-side settings. 43 | # If set to "false", client-side settings are used. 44 | enforce_work_group_configuration=False, 45 | engine_version=aws_athena.CfnWorkGroup.EngineVersionProperty( 46 | effective_engine_version='Athena engine version 3', 47 | selected_engine_version='Athena engine version 3' 48 | ), 49 | publish_cloud_watch_metrics_enabled=True, 50 | requester_pays_enabled=True, 51 | result_configuration=aws_athena.CfnWorkGroup.ResultConfigurationProperty( 52 | output_location=s3_bucket.s3_url_for_object() 53 | ) 54 | ) 55 | ) 56 | 57 | self.athena_work_group_name = athena_cfn_work_group.name 58 | 59 | cdk.CfnOutput(self, 'AthenaWorkGroupName', 60 | value=self.athena_work_group_name, 61 | export_name=f'{self.stack_name}-AthenaWorkGroupName') 62 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/firehose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Stack, 12 | aws_iam, 13 | aws_s3 as s3, 14 | aws_kinesisfirehose, 15 | ) 16 | 17 | from constructs import Construct 18 | from aws_cdk.aws_kinesisfirehose import CfnDeliveryStream as cfn 19 | 20 | random.seed(31) 21 | 22 | 23 | class FirehoseStack(Stack): 24 | 25 | def __init__(self, scope: Construct, construct_id: str, source_kinesis_stream_arn, data_transform_lambda_fn, **kwargs) -> None: 26 | super().__init__(scope, construct_id, **kwargs) 27 | 28 | s3_bucket = s3.Bucket(self, "s3bucket", 29 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 30 | bucket_name="web-analytics-{region}-{account_id}".format( 31 | region=cdk.Aws.REGION, account_id=cdk.Aws.ACCOUNT_ID)) 32 | 33 | FIREHOSE_DEFAULT_STREAM_NAME = 'PUT-S3-{}'.format(''.join(random.sample((string.ascii_letters), k=5))) 34 | firehose_config = self.node.try_get_context('firehose') 35 | 36 | FIREHOSE_STREAM_NAME = firehose_config.get('stream_name', FIREHOSE_DEFAULT_STREAM_NAME) 37 | FIREHOSE_BUFFER_SIZE = firehose_config['buffer_size_in_mbs'] 38 | FIREHOSE_BUFFER_INTERVAL = firehose_config['buffer_interval_in_seconds'] 39 | FIREHOSE_LAMBDA_BUFFER_SIZE = firehose_config['lambda_buffer_size_in_mbs'] 40 | FIREHOSE_LAMBDA_BUFFER_INTERVAL = firehose_config['lambda_buffer_interval_in_seconds'] 41 | FIREHOSE_LAMBDA_NUMBER_OF_RETRIES = firehose_config['lambda_number_of_retries'] 42 | FIREHOSE_TO_S3_PREFIX = firehose_config['prefix'] 43 | FIREHOSE_TO_S3_ERROR_OUTPUT_PREFIX = firehose_config['error_output_prefix'] 
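# These settings come from the "firehose" block of cdk.context.json (see .example.cdk.context.json
# in this project for a working shape); the assert below checks that `prefix` starts with
# "<s3_output_folder>/", so the folder name handed to the merge-small-files stack matches
# where Firehose actually writes.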
44 | FIREHOSE_TO_S3_OUTPUT_FOLDER = firehose_config['s3_output_folder'] 45 | 46 | assert f'{FIREHOSE_TO_S3_OUTPUT_FOLDER}/' == FIREHOSE_TO_S3_PREFIX[:len(FIREHOSE_TO_S3_OUTPUT_FOLDER) + 1] 47 | 48 | firehose_role_policy_doc = aws_iam.PolicyDocument() 49 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 50 | "effect": aws_iam.Effect.ALLOW, 51 | "resources": [s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)], 52 | "actions": ["s3:AbortMultipartUpload", 53 | "s3:GetBucketLocation", 54 | "s3:GetObject", 55 | "s3:ListBucket", 56 | "s3:ListBucketMultipartUploads", 57 | "s3:PutObject"] 58 | })) 59 | 60 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 61 | effect=aws_iam.Effect.ALLOW, 62 | resources=["*"], 63 | actions=["ec2:DescribeVpcs", 64 | "ec2:DescribeVpcAttribute", 65 | "ec2:DescribeSubnets", 66 | "ec2:DescribeSecurityGroups", 67 | "ec2:DescribeNetworkInterfaces", 68 | "ec2:CreateNetworkInterface", 69 | "ec2:CreateNetworkInterfacePermission", 70 | "ec2:DeleteNetworkInterface"] 71 | )) 72 | 73 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 74 | effect=aws_iam.Effect.ALLOW, 75 | resources=["*"], 76 | actions=["glue:GetTable", 77 | "glue:GetTableVersion", 78 | "glue:GetTableVersions"] 79 | )) 80 | 81 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 82 | effect=aws_iam.Effect.ALLOW, 83 | resources=[source_kinesis_stream_arn], 84 | actions=["kinesis:DescribeStream", 85 | "kinesis:GetShardIterator", 86 | "kinesis:GetRecords"] 87 | )) 88 | 89 | firehose_log_group_name = f"/aws/kinesisfirehose/{FIREHOSE_STREAM_NAME}" 90 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 91 | effect=aws_iam.Effect.ALLOW, 92 | #XXX: The ARN will be formatted as follows: 93 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 94 | resources=[self.format_arn(service="logs", resource="log-group", 95 | resource_name="{}:log-stream:*".format(firehose_log_group_name), 96 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 97 | actions=["logs:PutLogEvents"] 98 | )) 99 | 100 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 101 | "effect": aws_iam.Effect.ALLOW, 102 | #XXX: The ARN will be formatted as follows: 103 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 104 | "resources": [self.format_arn(partition="aws", service="lambda", 105 | region=cdk.Aws.REGION, account=cdk.Aws.ACCOUNT_ID, resource="function", 106 | resource_name="{}:*".format(data_transform_lambda_fn.function_name), 107 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 108 | "actions": ["lambda:InvokeFunction", 109 | "lambda:GetFunctionConfiguration"] 110 | })) 111 | 112 | firehose_role = aws_iam.Role(self, "KinesisFirehoseDeliveryRole", 113 | role_name="KinesisFirehoseServiceRole-{stream_name}-{region}".format( 114 | stream_name=FIREHOSE_STREAM_NAME, region=cdk.Aws.REGION), 115 | assumed_by=aws_iam.ServicePrincipal("firehose.amazonaws.com"), 116 | #XXX: use inline_policies to work around https://github.com/aws/aws-cdk/issues/5221 117 | inline_policies={ 118 | "firehose_role_policy": firehose_role_policy_doc 119 | } 120 | ) 121 | 122 | lambda_proc = cfn.ProcessorProperty( 123 | type="Lambda", 124 | parameters=[ 125 | cfn.ProcessorParameterProperty( 126 | parameter_name="LambdaArn", 127 | # parameter_value='{}:{}'.format(schema_validator_lambda_fn.function_arn, schema_validator_lambda_fn.current_version.version) 128 | 
parameter_value='{}:{}'.format(data_transform_lambda_fn.function_arn, data_transform_lambda_fn.latest_version.version) 129 | ), 130 | cfn.ProcessorParameterProperty( 131 | parameter_name="NumberOfRetries", 132 | parameter_value=str(FIREHOSE_LAMBDA_NUMBER_OF_RETRIES) 133 | ), 134 | cfn.ProcessorParameterProperty( 135 | parameter_name="RoleArn", 136 | parameter_value=firehose_role.role_arn 137 | ), 138 | cfn.ProcessorParameterProperty( 139 | parameter_name="BufferSizeInMBs", 140 | parameter_value=str(FIREHOSE_LAMBDA_BUFFER_SIZE) 141 | ), 142 | cfn.ProcessorParameterProperty( 143 | parameter_name="BufferIntervalInSeconds", 144 | parameter_value=str(FIREHOSE_LAMBDA_BUFFER_INTERVAL) 145 | ) 146 | ] 147 | ) 148 | 149 | firehose_processing_config = cfn.ProcessingConfigurationProperty( 150 | enabled=True, 151 | processors=[ 152 | lambda_proc 153 | ] 154 | ) 155 | 156 | ext_s3_dest_config = cfn.ExtendedS3DestinationConfigurationProperty( 157 | bucket_arn=s3_bucket.bucket_arn, 158 | role_arn=firehose_role.role_arn, 159 | buffering_hints={ 160 | "intervalInSeconds": FIREHOSE_BUFFER_INTERVAL, 161 | "sizeInMBs": FIREHOSE_BUFFER_SIZE 162 | }, 163 | cloud_watch_logging_options={ 164 | "enabled": True, 165 | "logGroupName": firehose_log_group_name, 166 | "logStreamName": "S3Delivery" 167 | }, 168 | compression_format="UNCOMPRESSED", # [GZIP | HADOOP_SNAPPY | Snappy | UNCOMPRESSED | ZIP] 169 | data_format_conversion_configuration={ 170 | "enabled": False 171 | }, 172 | dynamic_partitioning_configuration={ 173 | "enabled": False 174 | }, 175 | error_output_prefix=FIREHOSE_TO_S3_ERROR_OUTPUT_PREFIX, 176 | prefix=FIREHOSE_TO_S3_PREFIX, 177 | processing_configuration=firehose_processing_config 178 | ) 179 | 180 | firehose_to_s3_delivery_stream = aws_kinesisfirehose.CfnDeliveryStream(self, "KinesisFirehoseToS3", 181 | delivery_stream_name=FIREHOSE_STREAM_NAME, 182 | delivery_stream_type="KinesisStreamAsSource", 183 | kinesis_stream_source_configuration={ 184 | "kinesisStreamArn": source_kinesis_stream_arn, 185 | "roleArn": firehose_role.role_arn 186 | }, 187 | extended_s3_destination_configuration=ext_s3_dest_config 188 | ) 189 | 190 | self.s3_dest_bucket_name = s3_bucket.bucket_name 191 | self.s3_dest_folder_name = FIREHOSE_TO_S3_OUTPUT_FOLDER 192 | 193 | cdk.CfnOutput(self, 'S3DestBucket', 194 | value=s3_bucket.bucket_name, 195 | export_name=f'{self.stack_name}-S3DestBucket') 196 | cdk.CfnOutput(self, 'KinesisDataFirehoseName', 197 | value=firehose_to_s3_delivery_stream.delivery_stream_name, 198 | export_name=f'{self.stack_name}-KinesisDataFirehoseName') 199 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/firehose_dtata_transform_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lambda, 10 | aws_logs, 11 | aws_s3 as s3 12 | ) 13 | from constructs import Construct 14 | 15 | 16 | class FirehoseDataTransformLambdaStack(Stack): 17 | 18 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 19 | super().__init__(scope, construct_id, **kwargs) 20 | 21 | firehose_data_transform_lambda_config = self.node.try_get_context('firehose_data_tranform_lambda') 22 | LAMBDA_LAYER_CODE_S3_BUCKET = firehose_data_transform_lambda_config['s3_bucket_name'] 23 | LAMBDA_LAYER_CODE_S3_OBJ_KEY = 
firehose_data_transform_lambda_config['s3_object_key'] 24 | 25 | s3_lambda_layer_lib_bucket = s3.Bucket.from_bucket_name(self, "LambdaLayerS3Bucket", LAMBDA_LAYER_CODE_S3_BUCKET) 26 | lambda_lib_layer = aws_lambda.LayerVersion(self, "SchemaValidatorLib", 27 | layer_version_name="fastavro-lib", 28 | compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11], 29 | code=aws_lambda.Code.from_bucket(s3_lambda_layer_lib_bucket, LAMBDA_LAYER_CODE_S3_OBJ_KEY) 30 | ) 31 | 32 | SCHEMA_VALIDATOR_LAMBDA_FN_NAME = "SchemaValidator" 33 | schema_validator_lambda_fn = aws_lambda.Function(self, "SchemaValidator", 34 | runtime=aws_lambda.Runtime.PYTHON_3_11, 35 | function_name=SCHEMA_VALIDATOR_LAMBDA_FN_NAME, 36 | handler="schema_validator.lambda_handler", 37 | description="Check if records have valid schema", 38 | code=aws_lambda.Code.from_asset('./src/main/python/SchemaValidator'), 39 | timeout=cdk.Duration.minutes(5), 40 | #XXX: set memory size appropriately 41 | memory_size=256, 42 | layers=[lambda_lib_layer] 43 | ) 44 | 45 | log_group = aws_logs.LogGroup(self, "SchemaValidatorLogGroup", 46 | #XXX: Circular dependency between resources occurs 47 | # if aws_lambda.Function.function_name is used 48 | # instead of literal name of lambda function such as "SchemaValidator" 49 | log_group_name="/aws/lambda/{}".format(SCHEMA_VALIDATOR_LAMBDA_FN_NAME), 50 | retention=aws_logs.RetentionDays.THREE_DAYS, 51 | removal_policy=cdk.RemovalPolicy.DESTROY 52 | ) 53 | log_group.grant_write(schema_validator_lambda_fn) 54 | 55 | self.schema_validator_lambda_fn = schema_validator_lambda_fn 56 | 57 | cdk.CfnOutput(self, 'FirehoseDataTransformFuncName', 58 | value=self.schema_validator_lambda_fn.function_name, 59 | export_name=f'{self.stack_name}-FirehoseDataTransformFuncName') 60 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/glue_catalog_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueCatalogDatabaseStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | athena_database_info = self.node.try_get_context('merge_small_files_lambda_env') 20 | old_database_name = athena_database_info['OLD_DATABASE'] 21 | new_database_name = athena_database_info['NEW_DATABASE'] 22 | 23 | for idx, database_name in enumerate(list(set([old_database_name, new_database_name]))): 24 | cfn_database = aws_glue.CfnDatabase(self, f"GlueCfnDatabase{idx}", 25 | catalog_id=cdk.Aws.ACCOUNT_ID, 26 | database_input=aws_glue.CfnDatabase.DatabaseInputProperty( 27 | name=database_name 28 | ) 29 | ) 30 | cfn_database.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 31 | 32 | cdk.CfnOutput(self, f'GlueDatabaseName{idx}', 33 | value=cfn_database.database_input.name, 34 | export_name=f'{self.stack_name}-GlueDatabaseName{idx}') 35 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/kds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | 
import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Duration, 12 | Stack, 13 | aws_kinesis, 14 | ) 15 | from constructs import Construct 16 | 17 | random.seed(31) 18 | 19 | 20 | class KdsStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | KINESIS_DEFAULT_STREAM_NAME = 'PUT-Firehose-{}'.format(''.join(random.sample((string.ascii_letters), k=5))) 26 | KINESIS_STREAM_NAME = self.node.try_get_context('kinesis_stream_name') or KINESIS_DEFAULT_STREAM_NAME 27 | 28 | source_kinesis_stream = aws_kinesis.Stream(self, "SourceKinesisStreams", 29 | retention_period=Duration.hours(24), 30 | stream_mode=aws_kinesis.StreamMode.ON_DEMAND, 31 | stream_name=KINESIS_STREAM_NAME) 32 | 33 | self.target_kinesis_stream = source_kinesis_stream 34 | 35 | cdk.CfnOutput(self, 'KinesisDataStreamName', 36 | value=self.target_kinesis_stream.stream_name, 37 | export_name=f'{self.stack_name}-KinesisDataStreamName') 38 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/lake_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, job_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | athena_database_info = self.node.try_get_context('merge_small_files_lambda_env') 20 | old_database_name = athena_database_info['OLD_DATABASE'] 21 | new_database_name = athena_database_info['NEW_DATABASE'] 22 | 23 | database_list = list(set([old_database_name, new_database_name])) 24 | 25 | #XXXX: The role assumed by cdk is not a data lake administrator. 26 | # So, deploying PrincipalPermissions meets the error such as: 27 | # "Resource does not exist or requester is not authorized to access requested permissions." 28 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
29 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 30 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 31 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 32 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 33 | )] 34 | ) 35 | 36 | for idx, database_name in enumerate(database_list): 37 | lf_permissions_on_database = aws_lakeformation.CfnPrincipalPermissions(self, f"LFPermissionsOnDatabase{idx}", 38 | permissions=["CREATE_TABLE", "DROP", "ALTER", "DESCRIBE"], 39 | permissions_with_grant_option=[], 40 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 41 | data_lake_principal_identifier=job_role.role_arn 42 | ), 43 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 44 | database=aws_lakeformation.CfnPrincipalPermissions.DatabaseResourceProperty( 45 | catalog_id=cdk.Aws.ACCOUNT_ID, 46 | name=database_name 47 | ) 48 | ) 49 | ) 50 | lf_permissions_on_database.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 51 | 52 | #XXX: In order to keep resource destruction order, 53 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 54 | lf_permissions_on_database.add_dependency(cfn_data_lake_settings) 55 | 56 | for idx, database_name in enumerate(database_list): 57 | lf_permissions_on_table = aws_lakeformation.CfnPrincipalPermissions(self, f"LFPermissionsOnTable{idx}", 58 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 59 | permissions_with_grant_option=[], 60 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 61 | data_lake_principal_identifier=job_role.role_arn 62 | ), 63 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 64 | #XXX: Can't specify a TableWithColumns resource and a Table resource 65 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 66 | catalog_id=cdk.Aws.ACCOUNT_ID, 67 | database_name=database_name, 68 | table_wildcard={} 69 | ) 70 | ) 71 | ) 72 | lf_permissions_on_table.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 73 | lf_permissions_on_table.add_dependency(cfn_data_lake_settings) 74 | 75 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/merge_small_files_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_iam, 12 | aws_lambda, 13 | aws_logs, 14 | aws_events, 15 | aws_events_targets 16 | ) 17 | from constructs import Construct 18 | 19 | 20 | class MergeSmallFilesLambdaStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, s3_bucket_name, s3_folder_name, athena_work_group, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | _lambda_env = self.node.try_get_context('merge_small_files_lambda_env') 26 | 27 | LAMBDA_ENV_VARS = [ 28 | 'OLD_DATABASE', 29 | 'OLD_TABLE_NAME', 30 | 'NEW_DATABASE', 31 | 'NEW_TABLE_NAME', 32 | 'ATHENA_WORK_GROUP', 33 | 'OLD_TABLE_LOCATION_PREFIX', 34 | 'OUTPUT_PREFIX', 35 | 'STAGING_OUTPUT_PREFIX', 36 | 'COLUMN_NAMES' 37 | ] 38 | 39 | lambda_fn_env = {k: v for k, v in _lambda_env.items() if k in LAMBDA_ENV_VARS} 40 | additional_lambda_fn_env = { 41 | 
'ATHENA_WORK_GROUP': athena_work_group, 42 | 'OLD_TABLE_LOCATION_PREFIX': f"s3://{os.path.join(s3_bucket_name, s3_folder_name)}", 43 | 'OUTPUT_PREFIX': f"s3://{os.path.join(s3_bucket_name, _lambda_env['NEW_TABLE_S3_FOLDER_NAME'])}", 44 | 'STAGING_OUTPUT_PREFIX': f"s3://{os.path.join(s3_bucket_name, 'tmp')}", 45 | 'REGION_NAME': cdk.Aws.REGION 46 | } 47 | lambda_fn_env.update(additional_lambda_fn_env) 48 | 49 | self.s3_json_location, self.s3_parquet_location = (lambda_fn_env['OLD_TABLE_LOCATION_PREFIX'], lambda_fn_env['OUTPUT_PREFIX']) 50 | 51 | merge_small_files_lambda_fn = aws_lambda.Function(self, "MergeSmallFiles", 52 | runtime=aws_lambda.Runtime.PYTHON_3_11, 53 | function_name="MergeSmallFiles", 54 | handler="athena_ctas.lambda_handler", 55 | description="Merge small files in S3", 56 | code=aws_lambda.Code.from_asset('./src/main/python/MergeSmallFiles'), 57 | environment=lambda_fn_env, 58 | timeout=cdk.Duration.minutes(5) 59 | ) 60 | 61 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 62 | effect=aws_iam.Effect.ALLOW, 63 | resources=["*"], 64 | actions=["athena:*"])) 65 | 66 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 67 | effect=aws_iam.Effect.ALLOW, 68 | resources=["*"], 69 | actions=["s3:Get*", 70 | "s3:List*", 71 | "s3:AbortMultipartUpload", 72 | "s3:PutObject", 73 | ])) 74 | 75 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 76 | effect=aws_iam.Effect.ALLOW, 77 | resources=["*"], 78 | actions=["glue:CreateDatabase", 79 | "glue:DeleteDatabase", 80 | "glue:GetDatabase", 81 | "glue:GetDatabases", 82 | "glue:UpdateDatabase", 83 | "glue:CreateTable", 84 | "glue:DeleteTable", 85 | "glue:BatchDeleteTable", 86 | "glue:UpdateTable", 87 | "glue:GetTable", 88 | "glue:GetTables", 89 | "glue:BatchCreatePartition", 90 | "glue:CreatePartition", 91 | "glue:DeletePartition", 92 | "glue:BatchDeletePartition", 93 | "glue:UpdatePartition", 94 | "glue:GetPartition", 95 | "glue:GetPartitions", 96 | "glue:BatchGetPartition" 97 | ])) 98 | 99 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 100 | effect=aws_iam.Effect.ALLOW, 101 | resources=["*"], 102 | actions=["lakeformation:GetDataAccess"])) 103 | 104 | lambda_fn_target = aws_events_targets.LambdaFunction(merge_small_files_lambda_fn) 105 | aws_events.Rule(self, "ScheduleRule", 106 | schedule=aws_events.Schedule.cron(minute="10"), 107 | targets=[lambda_fn_target] 108 | ) 109 | 110 | log_group = aws_logs.LogGroup(self, "MergeSmallFilesLogGroup", 111 | log_group_name=f"/aws/lambda/{self.stack_name}/MergeSmallFiles", 112 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: for testing 113 | retention=aws_logs.RetentionDays.THREE_DAYS) 114 | log_group.grant_write(merge_small_files_lambda_fn) 115 | 116 | self.lambda_exec_role = merge_small_files_lambda_fn.role 117 | 118 | 119 | cdk.CfnOutput(self, 'MergeFilesFuncName', 120 | value=merge_small_files_lambda_fn.function_name, 121 | export_name=f'{self.stack_name}-MergeFilesLambdaFuncName') 122 | cdk.CfnOutput(self, 'LambdaExecRoleArn', 123 | value=self.lambda_exec_role.role_arn, 124 | export_name=f'{self.stack_name}-LambdaExecRoleArn') 125 | -------------------------------------------------------------------------------- /web-analytics-parquet/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.24.41 2 | mimesis==18.0.0 3 | requests>=2.31.0 4 | 5 | # packages for Lambda Layer 6 | fastavro==1.10.0 7 | 
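# NOTE: fastavro is the package bundled into the "fastavro-lib" Lambda layer used by the
# SchemaValidator function; FirehoseDataTransformLambdaStack expects the packaged layer zip
# to be available at the S3 bucket/object configured under the 'firehose_data_tranform_lambda'
# context key (see cdk_stacks/firehose_dtata_transform_lambda.py).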
-------------------------------------------------------------------------------- /web-analytics-parquet/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.139.1 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /web-analytics-parquet/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /web-analytics-parquet/src/main/python/MergeSmallFiles/athena_ctas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import os 7 | import datetime 8 | import time 9 | import random 10 | 11 | import boto3 12 | 13 | random.seed(47) 14 | 15 | DRY_RUN = (os.getenv('DRY_RUN', 'false').lower() == 'true') 16 | AWS_REGION = os.getenv('REGION_NAME', 'us-east-1') 17 | 18 | OLD_DATABASE = os.getenv('OLD_DATABASE') 19 | OLD_TABLE_NAME = os.getenv('OLD_TABLE_NAME') 20 | NEW_DATABASE = os.getenv('NEW_DATABASE') 21 | NEW_TABLE_NAME = os.getenv('NEW_TABLE_NAME') 22 | WORK_GROUP = os.getenv('ATHENA_WORK_GROUP', 'primary') 23 | OLD_TABLE_LOCATION_PREFIX = os.getenv('OLD_TABLE_LOCATION_PREFIX') 24 | OUTPUT_PREFIX = os.getenv('OUTPUT_PREFIX') 25 | STAGING_OUTPUT_PREFIX = os.getenv('STAGING_OUTPUT_PREFIX') 26 | COLUMN_NAMES = os.getenv('COLUMN_NAMES', '*') 27 | 28 | EXTERNAL_LOCATION_FMT = '''{output_prefix}/year={year}/month={month:02}/day={day:02}/hour={hour:02}/''' 29 | 30 | CTAS_QUERY_FMT = '''CREATE TABLE {new_database}.tmp_{new_table_name} 31 | WITH ( 32 | external_location='{location}', 33 | format = 'PARQUET', 34 | parquet_compression = 'SNAPPY') 35 | AS SELECT {columns} 36 | FROM {old_database}.{old_table_name} 37 | WHERE year={year} AND month={month} AND day={day} AND hour={hour} 38 | WITH DATA 39 | ''' 40 | 41 | def run_alter_table_add_partition(athena_client, basic_dt, database_name, table_name, output_prefix): 42 | year, month, day, hour = (basic_dt.year, basic_dt.month, basic_dt.day, basic_dt.hour) 43 | 44 | tmp_table_name = '{table}_{year}{month:02}{day:02}{hour:02}'.format(table=table_name, 45 | year=year, month=month, day=day, hour=hour) 46 | 47 | output_location = '{}/alter_table_{}'.format(STAGING_OUTPUT_PREFIX, tmp_table_name) 48 | 49 | alter_table_stmt = '''ALTER TABLE {database}.{table_name} ADD if NOT EXISTS'''.format(database=database_name, 50 | table_name=table_name) 51 | 52 | partition_expr = '''PARTITION (year={year}, month={month}, day={day}, hour={hour}) LOCATION "{output_prefix}/year={year}/month={month:02}/day={day:02}/hour={hour:02}/"''' 53 | 54 | partition_expr_list = [] 55 | for i in (1, 0, -1): 56 | dt = basic_dt - datetime.timedelta(hours=i) 57 | year, month, day, hour = (dt.year, dt.month, dt.day, dt.hour) 58 | part_expr = partition_expr.format(year=year, month=month, day=day, hour=hour, 
output_prefix=output_prefix) 59 | partition_expr_list.append(part_expr) 60 | 61 | query = '{} {}'.format(alter_table_stmt, '\n'.join(partition_expr_list)) 62 | print('[INFO] QueryString:\n{}'.format(query), file=sys.stderr) 63 | print('[INFO] OutputLocation: {}'.format(output_location), file=sys.stderr) 64 | 65 | if DRY_RUN: 66 | print('[INFO] End of dry-run', file=sys.stderr) 67 | return 68 | 69 | response = athena_client.start_query_execution( 70 | QueryString=query, 71 | ResultConfiguration={ 72 | 'OutputLocation': output_location 73 | }, 74 | WorkGroup=WORK_GROUP 75 | ) 76 | print('[INFO] QueryExecutionId: {}'.format(response['QueryExecutionId']), file=sys.stderr) 77 | 78 | 79 | def run_drop_tmp_table(athena_client, basic_dt): 80 | year, month, day, hour = (basic_dt.year, basic_dt.month, basic_dt.day, basic_dt.hour) 81 | 82 | tmp_table_name = '{table}_{year}{month:02}{day:02}{hour:02}'.format(table=NEW_TABLE_NAME, 83 | year=year, month=month, day=day, hour=hour) 84 | 85 | output_location = '{}/tmp_{}'.format(STAGING_OUTPUT_PREFIX, tmp_table_name) 86 | query = 'DROP TABLE IF EXISTS {database}.tmp_{table_name}'.format(database=NEW_DATABASE, 87 | table_name=tmp_table_name) 88 | 89 | print('[INFO] QueryString:\n{}'.format(query), file=sys.stderr) 90 | print('[INFO] OutputLocation: {}'.format(output_location), file=sys.stderr) 91 | 92 | if DRY_RUN: 93 | print('[INFO] End of dry-run', file=sys.stderr) 94 | return 95 | 96 | response = athena_client.start_query_execution( 97 | QueryString=query, 98 | ResultConfiguration={ 99 | 'OutputLocation': output_location 100 | }, 101 | WorkGroup=WORK_GROUP 102 | ) 103 | print('[INFO] QueryExecutionId: {}'.format(response['QueryExecutionId']), file=sys.stderr) 104 | 105 | 106 | def run_ctas(athena_client, basic_dt): 107 | year, month, day, hour = (basic_dt.year, basic_dt.month, basic_dt.day, basic_dt.hour) 108 | 109 | new_table_name = '{table}_{year}{month:02}{day:02}{hour:02}'.format(table=NEW_TABLE_NAME, 110 | year=year, month=month, day=day, hour=hour) 111 | 112 | output_location = '{}/tmp_{}'.format(STAGING_OUTPUT_PREFIX, new_table_name) 113 | external_location = EXTERNAL_LOCATION_FMT.format(output_prefix=OUTPUT_PREFIX, 114 | year=year, month=month, day=day, hour=hour) 115 | 116 | query = CTAS_QUERY_FMT.format(new_database=NEW_DATABASE, new_table_name=new_table_name, 117 | old_database=OLD_DATABASE, old_table_name=OLD_TABLE_NAME, columns=COLUMN_NAMES, 118 | year=year, month=month, day=day, hour=hour, location=external_location) 119 | 120 | print('[INFO] QueryString:\n{}'.format(query), file=sys.stderr) 121 | print('[INFO] ExternalLocation: {}'.format(external_location), file=sys.stderr) 122 | print('[INFO] OutputLocation: {}'.format(output_location), file=sys.stderr) 123 | 124 | if DRY_RUN: 125 | print('[INFO] End of dry-run', file=sys.stderr) 126 | return 127 | 128 | response = athena_client.start_query_execution( 129 | QueryString=query, 130 | QueryExecutionContext={ 131 | 'Database': NEW_DATABASE 132 | }, 133 | ResultConfiguration={ 134 | 'OutputLocation': output_location 135 | }, 136 | WorkGroup=WORK_GROUP 137 | ) 138 | print('[INFO] QueryExecutionId: {}'.format(response['QueryExecutionId']), file=sys.stderr) 139 | 140 | 141 | def lambda_handler(event, context): 142 | event_dt = datetime.datetime.strptime(event['time'], "%Y-%m-%dT%H:%M:%SZ") 143 | prev_basic_dt, basic_dt = [event_dt - datetime.timedelta(hours=e) for e in (2, 1)] 144 | 145 | client = boto3.client('athena', region_name=AWS_REGION) 146 | run_drop_tmp_table(client, prev_basic_dt) 147 
| 148 | if not DRY_RUN: 149 | print('[INFO] Wait for a few seconds until dropping old table', file=sys.stderr) 150 | time.sleep(10) 151 | 152 | run_alter_table_add_partition(client, basic_dt, 153 | database_name=OLD_DATABASE, 154 | table_name=OLD_TABLE_NAME, 155 | output_prefix=OLD_TABLE_LOCATION_PREFIX) 156 | 157 | if not DRY_RUN: 158 | print('[INFO] Wait for a few seconds until adding partitions to table: %s.%s' % (OLD_DATABASE, OLD_TABLE_NAME), file=sys.stderr) 159 | time.sleep(10) 160 | 161 | run_alter_table_add_partition(client, basic_dt, 162 | database_name=NEW_DATABASE, 163 | table_name=NEW_TABLE_NAME, 164 | output_prefix=OUTPUT_PREFIX) 165 | 166 | if not DRY_RUN: 167 | print('[INFO] Wait for a few seconds until adding partitions to table: %s.%s' % (NEW_DATABASE, NEW_TABLE_NAME), file=sys.stderr) 168 | time.sleep(10) 169 | 170 | run_ctas(client, basic_dt) 171 | 172 | 173 | if __name__ == '__main__': 174 | import argparse 175 | 176 | parser = argparse.ArgumentParser() 177 | parser.add_argument('-dt', '--basic-datetime', default=datetime.datetime.today().strftime('%Y-%m-%dT%H:05:00Z'), 178 | help='The scheduled event occurrence time ex) 2020-02-28T03:05:00Z') 179 | parser.add_argument('--region-name', default='us-east-1', 180 | help='aws region name') 181 | parser.add_argument('--old-database', default='mydatabase', 182 | help='aws athena source database name used by ctas query') 183 | parser.add_argument('--old-table-name', default='web_log_json', 184 | help='aws athena source table name used by ctas query') 185 | parser.add_argument('--new-database', default='mydatabase', 186 | help='aws athena target database name for merged files') 187 | parser.add_argument('--new-table-name', default='ctas_web_log_parquet', 188 | help='aws athena target table name for merged files') 189 | parser.add_argument('--work-group', default='primary', 190 | help='aws athena work group') 191 | parser.add_argument('--old-table-location-prefix', required=True, 192 | help='s3 path for aws athena source table') 193 | parser.add_argument('--output-prefix', required=True, 194 | help='s3 path for aws athena target table') 195 | parser.add_argument('--staging-output-prefix', required=True, 196 | help='s3 path for aws athena tmp table') 197 | parser.add_argument('--column-names', default='*', 198 | help='selectable column names of aws athena source table') 199 | parser.add_argument('--run', action='store_true', 200 | help='run ctas query') 201 | 202 | options = parser.parse_args() 203 | 204 | DRY_RUN = False if options.run else True 205 | AWS_REGION = options.region_name 206 | OLD_DATABASE = options.old_database 207 | OLD_TABLE_NAME= options.old_table_name 208 | NEW_DATABASE = options.new_database 209 | NEW_TABLE_NAME = options.new_table_name 210 | WORK_GROUP = options.work_group 211 | OLD_TABLE_LOCATION_PREFIX = options.old_table_location_prefix 212 | OUTPUT_PREFIX = options.output_prefix 213 | STAGING_OUTPUT_PREFIX = options.staging_output_prefix 214 | COLUMN_NAMES = options.column_names 215 | 216 | event = { 217 | "id": "cdc73f9d-aea9-11e3-9d5a-835b769c0d9c", 218 | "detail-type": "Scheduled Event", 219 | "source": "aws.events", 220 | "account": "123456789012", 221 | "time": options.basic_datetime, # ex) "2020-02-28T03:05:00Z" 222 | "region": AWS_REGION, # ex) "us-east-1", 223 | "resources": [ 224 | f"arn:aws:events:{AWS_REGION}:123456789012:rule/ExampleRule" 225 | ], 226 | "detail": {} 227 | } 228 | print('[DEBUG] event:\n{}'.format(event), file=sys.stderr) 229 | lambda_handler(event, {}) 230 | 
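# NOTE: a minimal local dry-run sketch for this CTAS merge script. The database/table names
# below are just the argparse defaults above; the S3 prefixes are placeholders, not values
# taken from this repository:
#
#   python3 athena_ctas.py \
#     --region-name us-east-1 \
#     --old-database mydatabase --old-table-name web_log_json \
#     --new-database mydatabase --new-table-name ctas_web_log_parquet \
#     --work-group primary \
#     --old-table-location-prefix s3://<your-bucket>/json-data \
#     --output-prefix s3://<your-bucket>/parquet-data \
#     --staging-output-prefix s3://<your-bucket>/tmp
#
# Without --run the script only prints the ALTER TABLE and CTAS statements it would submit
# to Athena (DRY_RUN stays True); add --run to actually start the query executions.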
-------------------------------------------------------------------------------- /web-analytics-parquet/src/main/python/SchemaValidator/schema_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | #vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import base64 6 | import json 7 | import logging 8 | import collections 9 | from datetime import datetime 10 | 11 | import fastavro 12 | 13 | LOGGER = logging.getLogger() 14 | if len(LOGGER.handlers) > 0: 15 | # The Lambda environment pre-configures a handler logging to stderr. 16 | # If a handler is already configured, `.basicConfig` does not execute. 17 | # Thus we set the level directly. 18 | LOGGER.setLevel(logging.INFO) 19 | else: 20 | logging.basicConfig(level=logging.INFO) 21 | 22 | 23 | ORIGINAL_SCHEMA = { 24 | 'name': 'WebLogs', 25 | 'type': 'record', 26 | 'fields': [ 27 | { 28 | 'name': 'userId', 29 | 'type': 'string' 30 | }, 31 | { 32 | 'name': 'sessionId', 33 | 'type': 'string' 34 | }, 35 | { 36 | 'name': 'referrer', 37 | 'type': ['string', 'null'] 38 | }, 39 | { 40 | 'name': 'userAgent', 41 | 'type': ['string', 'null'] 42 | }, 43 | { 44 | 'name': 'ip', 45 | 'type': 'string' 46 | }, 47 | { 48 | 'name': 'hostname', 49 | 'type': 'string' 50 | }, 51 | { 52 | 'name': 'os', 53 | 'type': ['string', 'null'] 54 | }, 55 | { 56 | 'name': 'timestamp', 57 | 'type': { 58 | 'type': 'string', 59 | 'logicalType': 'datetime' 60 | } 61 | }, 62 | { 63 | 'name': 'uri', 64 | 'type': 'string' 65 | } 66 | ] 67 | } 68 | 69 | 70 | def read_datetime(data, writer_schema=None, reader_schema=None): 71 | return datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 72 | 73 | def prepare_datetime(data, schema): 74 | """Converts datetime.datetime to string representing the date and time""" 75 | if isinstance(data, datetime): 76 | return datetime.strftime('%Y-%m-%dT%H:%M:%SZ') 77 | else: 78 | try: 79 | dt = datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 80 | return dt.strftime('%Y-%m-%dT%H:%M:%SZ') 81 | except Exception as ex: 82 | return None 83 | 84 | fastavro.read.LOGICAL_READERS["string-datetime"] = read_datetime 85 | fastavro.write.LOGICAL_WRITERS["string-datetime"] = prepare_datetime 86 | 87 | PARSED_SCHEMA = fastavro.parse_schema(ORIGINAL_SCHEMA) 88 | 89 | def check_schema(record): 90 | try: 91 | return fastavro.validation.validate(record, PARSED_SCHEMA, raise_errors=False) 92 | except Exception as ex: 93 | LOGGER.error(ex) 94 | return False 95 | 96 | # Signature for all Lambda functions that user must implement 97 | def lambda_handler(firehose_records_input, context): 98 | LOGGER.debug("Received records for processing from DeliveryStream: {deliveryStreamArn}, Region: {region}, and InvocationId: {invocationId}".format( 99 | deliveryStreamArn=firehose_records_input['deliveryStreamArn'], 100 | region=firehose_records_input['region'], 101 | invocationId=firehose_records_input['invocationId'])) 102 | 103 | # Create return value. 104 | firehose_records_output = {'records': []} 105 | 106 | counter = collections.Counter(total=0, valid=0, invalid=0) 107 | 108 | # Create result object. 
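# Each Firehose record below is base64-decoded, parsed as JSON, and validated against
# PARSED_SCHEMA; valid records are re-emitted as JSON Lines with result 'Ok', while invalid
# ones are marked 'ProcessingFailed' so Firehose writes them under the configured error
# output prefix in S3.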
109 | # Go through records and process them 110 | for firehose_record_input in firehose_records_input['records']: 111 | counter['total'] += 1 112 | 113 | # Get user payload 114 | payload = base64.b64decode(firehose_record_input['data']) 115 | json_value = json.loads(payload) 116 | 117 | LOGGER.debug("Record that was received: {}".format(json_value)) 118 | 119 | #TODO: check if schema is valid 120 | is_valid = check_schema(json_value) 121 | counter['valid' if is_valid else 'invalid'] += 1 122 | 123 | # Create output Firehose record and add modified payload and record ID to it. 124 | firehose_record_output = { 125 | 'recordId': firehose_record_input['recordId'], 126 | #XXX: convert JSON to JSONLine 127 | 'data': base64.b64encode(payload.rstrip(b'\n') + b'\n'), 128 | 129 | # The status of the data transformation of the record. 130 | # The possible values are: 131 | # Ok (the record was transformed successfully), 132 | # Dropped (the record was dropped intentionally by your processing logic), 133 | # and ProcessingFailed (the record could not be transformed). 134 | # If a record has a status of Ok or Dropped, Kinesis Data Firehose considers it successfully processed. 135 | # Otherwise, Kinesis Data Firehose considers it unsuccessfully processed. 136 | 137 | # 'ProcessFailed' record will be put into error bucket in S3 138 | 'result': 'Ok' if is_valid else 'ProcessingFailed' # [Ok, Dropped, ProcessingFailed] 139 | } 140 | 141 | # Must set proper record ID 142 | # Add the record to the list of output records. 143 | firehose_records_output['records'].append(firehose_record_output) 144 | 145 | LOGGER.info(', '.join("{}={}".format(k, v) for k, v in counter.items())) 146 | 147 | # At the end return processed records 148 | return firehose_records_output 149 | 150 | 151 | if __name__ == '__main__': 152 | import pprint 153 | 154 | record_list = [ 155 | { 156 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 157 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 158 | "referrer": "brandon.biz", 159 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 160 | "ip": "202.165.71.49", 161 | "hostname": "toxic.tokyo", 162 | "os": "openSUSE", 163 | "timestamp": "2022-09-16T07:35:46Z", 164 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 165 | }, 166 | { 167 | "userId": "70b1f606-aa63-47fb-bc92-76de9c59d064", 168 | "sessionId": "928e78473db8449b17644b2c", 169 | # missing optional data 170 | # "referrer": "toe.gq", 171 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.53", 172 | "ip": "12.166.113.176", 173 | "hostname": "drivers.glass", 174 | "os": "Windows 8.1", 175 | "timestamp": "2022-09-16T07:52:47Z", 176 | "uri": "https://aaa.gov/2022/04/29/cialis-prayer-presentations-completed-avenue-vision?trucks=cut&indeed=members" 177 | }, 178 | { 179 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 180 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 181 | "referrer": "brandon.biz", 182 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 183 | "ip": "202.165.71.49", 184 | "hostname": "toxic.tokyo", 185 | "os": "openSUSE", 186 | # invalid datetime format 187 | "timestamp": "2022-09-16 07:35:46", 188 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 189 | }, 190 | { 191 | # missing required data 192 | # "userId": "045e63c7-b276-4117-9706-7c2e3b87d5f5", 193 | 
"sessionId": "abfd47eb7dd7b8aeec0555a7", 194 | "referrer": "transfer.edu", 195 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 9.50", 196 | "ip": "170.128.148.234", 197 | "hostname": "propecia.tc", 198 | "os": "Lubuntu", 199 | "timestamp": "2022-09-16T07:46:04Z", 200 | "uri": "https://pee.cloud/2019/06/15/alan-publish-perl-snow-notification-gap-improvement-guaranteed-changed-determining?casino=admissions&cottage=hotel" 201 | }, 202 | { 203 | "userId": "e504cd9d-30da-497f-8f28-2b3f64220e16", 204 | "sessionId": "fd4807ab825ee8bd950b1e8b", 205 | "referrer": "liquid.aquitaine", 206 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.02", 207 | # mismatched data type 208 | "ip": 212234672, 209 | "hostname": "consequently.com", 210 | "os": "Gentoo", 211 | "timestamp": "2022-09-16T07:13:29Z", 212 | "uri": "https://railway.sz/2014/10/30/use-phone-task-marketplace?pot=it&album=cook" 213 | } 214 | ] 215 | 216 | for record in record_list: 217 | event = { 218 | "invocationId": "invocationIdExample", 219 | "deliveryStreamArn": "arn:aws:kinesis:EXAMPLE", 220 | "region": "us-east-1", 221 | "records": [ 222 | { 223 | "recordId": "49546986683135544286507457936321625675700192471156785154", 224 | "approximateArrivalTimestamp": 1495072949453, 225 | "data": base64.b64encode(json.dumps(record).encode('utf-8')) 226 | } 227 | ] 228 | } 229 | 230 | res = lambda_handler(event, {}) 231 | for elem in res['records']: 232 | print(f"[{elem['result']}]") 233 | print(base64.b64decode(elem['data']).decode('utf-8')) 234 | 235 | -------------------------------------------------------------------------------- /web-analytics-parquet/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import ( 8 | datetime, 9 | timezone 10 | ) 11 | import json 12 | import time 13 | import typing 14 | 15 | from mimesis.locales import Locale 16 | from mimesis.schema import Field, Schema 17 | from mimesis.providers.base import BaseProvider 18 | import requests 19 | 20 | 21 | class CustomDatetime(BaseProvider): 22 | class Meta: 23 | """Class for metadata.""" 24 | name: typing.Final[str] = "custom_datetime" 25 | 26 | def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: 27 | super().__init__(*args, **kwargs) 28 | 29 | def timestamp(self) -> str: 30 | utc_now = datetime.now(timezone.utc) 31 | minute = self.random.randint(0, 59) 32 | second = self.random.randint(0, 59) 33 | random_datetime = utc_now.replace(minute=minute, second=second) 34 | return random_datetime.strftime("%Y-%m-%dT%H:%M:%SZ") 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | 40 | parser.add_argument('--api-url', help='log collector api url') 41 | parser.add_argument('--api-method', default='records', choices=['record', 'records'], 42 | help='log collector api method [record | records]') 43 | parser.add_argument('--stream-name', help='kinesis stream name') 44 | parser.add_argument('--max-count', default=15, type=int, help='max number of records to put') 45 | parser.add_argument('--dry-run', action='store_true') 46 | 47 | options = parser.parse_args() 48 | 49 | _field = Field(locale=Locale.EN) 50 | _field._generic.add_provider(CustomDatetime) 51 | 52 | schema_definition = lambda: { 53 | "userId": _field("uuid"), 54 | "sessionId": _field("token_hex", entropy=12), 55 | 
"referrer": _field("internet.hostname"), 56 | "userAgent": _field("internet.user_agent"), 57 | "ip": _field("internet.ip_v4"), 58 | "hostname": _field("internet.hostname"), 59 | "os": _field("development.os"), 60 | "timestamp": _field("custom_datetime.timestamp"), 61 | "uri": _field("internet.uri", query_params_count=2) 62 | } 63 | schema = Schema(schema=schema_definition, iterations=options.max_count) 64 | 65 | log_collector_url = f'{options.api_url}/streams/{options.stream_name}/{options.api_method}' if not options.dry_run else None 66 | 67 | for record in schema: 68 | if options.dry_run: 69 | print(json.dumps(record), file=sys.stderr) 70 | continue 71 | 72 | partition_key = record['userId'] 73 | if options.api_method == 'record': 74 | data = {'Data': record, 'PartitionKey': partition_key} 75 | payload = f'{json.dumps(data)}' 76 | else: 77 | #XXX: make sure data has newline 78 | data = {"records":[{'data': f'{json.dumps(record)}\n', 'partition-key': partition_key}]} 79 | payload = json.dumps(data) 80 | 81 | res = requests.put(log_collector_url, data=payload, headers={'Content-Type': 'application/json'}) 82 | if res.status_code == 200: 83 | print(f'[{res.status_code} {res.reason}]', res.text, file=sys.stderr) 84 | else: 85 | print(f'[{res.status_code} {res.reason}]', file=sys.stderr) 86 | sys.exit(1) 87 | time.sleep(0.5) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /web-analytics-parquet/src/utils/kds_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import argparse 6 | import pprint 7 | import random 8 | import time 9 | 10 | import boto3 11 | 12 | random.seed(47) 13 | 14 | SHARD_ITER_TYPE = ('TRIM_HORIZON', 'LATEST') 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--stream-name', action="store", help='kinesis stream name') 20 | parser.add_argument('--shard-id', action="store", help='kinesis stream shard-id') 21 | parser.add_argument('--iter-type', choices=SHARD_ITER_TYPE, default='LATEST', 22 | help='kinesis stream shard iterator type: [{}]'.format(', '.join(SHARD_ITER_TYPE))) 23 | parser.add_argument('--region-name', action='store', default='us-east-1', 24 | help='aws region name (default: us-east-1)') 25 | 26 | options = parser.parse_args() 27 | 28 | stream_name, shard_iter_type = options.stream_name, options.iter_type 29 | 30 | kinesis_client = boto3.client('kinesis', region_name=options.region_name) 31 | response = kinesis_client.describe_stream(StreamName=stream_name) 32 | if options.shard_id: 33 | shard_id = options.shard_id 34 | else: 35 | shard_id_list = [e['ShardId'] for e in response['StreamDescription']['Shards']] 36 | shard_id = random.choice(shard_id_list) 37 | 38 | shard_iterator = kinesis_client.get_shard_iterator(StreamName=stream_name, 39 | ShardId=shard_id, 40 | ShardIteratorType=shard_iter_type) 41 | 42 | shard_iter = shard_iterator['ShardIterator'] 43 | record_response = kinesis_client.get_records(ShardIterator=shard_iter, Limit=123) 44 | pprint.pprint(record_response.get('Records', []), indent=2) 45 | 46 | while 'NextShardIterator' in record_response: 47 | record_response = kinesis_client.get_records(ShardIterator=record_response['NextShardIterator'], Limit=123) 48 | pprint.pprint(record_response.get('Records', []), indent=2) 49 | 50 | # wait for a few seconds 51 | 
time.sleep(5) 52 | 53 | if __name__ == '__main__': 54 | main() 55 | 56 | --------------------------------------------------------------------------------
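Example usage of the utility scripts above (illustrative only; the API endpoint, stream name,
and region are placeholders, not values taken from this repository):

  # send fake web-log events to the log collector API in front of the Kinesis data stream
  python3 src/utils/gen_fake_data.py \
    --api-url https://<api-id>.execute-api.<region>.amazonaws.com/<stage> \
    --api-method records \
    --stream-name <your-kinesis-stream-name> \
    --max-count 15
  # (add --dry-run to print the generated records to stderr without sending them)

  # read the same events back from the Kinesis data stream
  python3 src/utils/kds_consumer.py \
    --stream-name <your-kinesis-stream-name> \
    --iter-type TRIM_HORIZON \
    --region-name us-east-1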