├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── web-analytics-datafirehose-iceberg ├── .example.cdk.context.json ├── .gitignore ├── README.md ├── app.py ├── assets │ ├── amazon-athena-switching-to-workgroup.png │ ├── wa-iceberg-data-level-01.png │ ├── wa-iceberg-data-level-02.png │ ├── wa-iceberg-data-level-03.png │ └── wa-iceberg-table.png ├── build-aws-lambda-layer-package.sh ├── cdk.json ├── cdk_stacks │ ├── __init__.py │ ├── apigw.py │ ├── firehose_data_proc_lambda.py │ ├── firehose_role.py │ ├── firehose_to_iceberg.py │ ├── lake_formation.py │ └── s3.py ├── requirements-dev.txt ├── requirements.txt ├── source.bat ├── src │ ├── main │ │ └── python │ │ │ └── IcebergTransformer │ │ │ └── firehose_to_iceberg_transformer.py │ └── utils │ │ └── gen_fake_data.py └── web-analytics-datafirehose-iceberg-arch.svg ├── web-analytics-iceberg ├── .example.cdk.context.json ├── .gitignore ├── README.md ├── app.py ├── assets │ ├── amazon-athena-switching-to-workgroup.png │ ├── wa-iceberg-data-level-01.png │ ├── wa-iceberg-data-level-02.png │ ├── wa-iceberg-data-level-03.png │ └── wa-iceberg-table.png ├── build-aws-lambda-layer-package.sh ├── cdk.json ├── cdk_stacks │ ├── __init__.py │ ├── apigw.py │ ├── firehose_data_proc_lambda.py │ ├── firehose_role.py │ ├── firehose_to_iceberg.py │ ├── kds.py │ ├── lake_formation.py │ └── s3.py ├── requirements-dev.txt ├── requirements.txt ├── source.bat ├── src │ ├── main │ │ └── python │ │ │ └── IcebergTransformer │ │ │ └── firehose_to_iceberg_transformer.py │ └── utils │ │ ├── gen_fake_data.py │ │ └── kds_consumer.py └── web-analytics-iceberg-arch.svg └── web-analytics-parquet ├── .example.cdk.context.json ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── app.py ├── assets ├── amazon-athena-switching-to-workgroup.png └── data-lake-formation-permissions.png ├── build-aws-lambda-layer-package.sh ├── cdk.json ├── cdk_stacks ├── __init__.py ├── apigw.py ├── athena_named_query.py ├── athena_workgroup.py ├── firehose.py ├── firehose_dtata_transform_lambda.py ├── glue_catalog_database.py ├── kds.py ├── lake_formation.py └── merge_small_files_lambda.py ├── requirements-dev.txt ├── requirements.txt ├── source.bat ├── src ├── main │ └── python │ │ ├── MergeSmallFiles │ │ └── athena_ctas.py │ │ └── SchemaValidator │ │ └── schema_validator.py └── utils │ ├── gen_fake_data.py │ └── kds_consumer.py └── web-analytics-arch.svg /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web Log Analytics with Amazon Kinesis Data Streams Proxy using Amazon API Gateway 3 | 4 | This repository provides you cdk scripts and sample code on how to implement a simple [web analytics](https://en.wikipedia.org/wiki/Web_analytics) system.
5 | Below diagram shows what we are implementing. 6 | 7 | | Example | Architecture Diagram | Tags | 8 | |---------|----------------------|------| 9 | | [Web Log Analytics System with Parquet data format](./web-analytics-parquet) | ![](./web-analytics-parquet/web-analytics-arch.svg) | Amazon API Gateway, Amazon Kinesis Data Streams, Amazon Data Firehose, Amazon S3 + Parquet, Amazon Athena, AWS Lambda, Amazon Event Bridge | 10 | | [Web Log Analytics System with Apache Iceberg table](./web-analytics-iceberg) | ![](./web-analytics-iceberg/web-analytics-iceberg-arch.svg) | Amazon API Gateway, Amazon Kinesis Data Streams, Amazon Data Firehose, Amazon S3 + Apache Iceberg, Amazon Athena, AWS Lambda | 11 | | [Web Log Analytics System using API Gateway integrated with Data Firehose with Apache Iceberg table](./web-analytics-datafirehose-iceberg) | ![](./web-analytics-datafirehose-iceberg/web-analytics-datafirehose-iceberg-arch.svg) | Amazon API Gateway, Amazon Data Firehose, Amazon S3 + Apache Iceberg, Amazon Athena, AWS Lambda | 12 | 13 | ## Security 14 | 15 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 16 | 17 | ## License 18 | 19 | This library is licensed under the MIT-0 License. See the LICENSE file. 20 | 21 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/.example.cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "acknowledged-issue-numbers": [ 3 | 32775 4 | ], 5 | "firehose_data_tranform_lambda": { 6 | "s3_bucket_name": "s3-bucket-name-for-lambda-layer-resources", 7 | "s3_object_key": "var/fastavro-lib-1.10.0-py-3.11.zip" 8 | }, 9 | "data_firehose_configuration": { 10 | "stream_name": "PUT-Firehose-aEhWz", 11 | "buffering_hints": { 12 | "interval_in_seconds": 60, 13 | "size_in_mbs": 128 14 | }, 15 | "transform_records_with_aws_lambda": { 16 | "buffer_size": 3, 17 | "buffer_interval": 300, 18 | "number_of_retries": 3 19 | }, 20 | "destination_iceberg_table_configuration": { 21 | "database_name": "web_log_iceberg_db", 22 | "table_name": "web_log_iceberg", 23 | "unique_keys": [ 24 | "user_id", "timestamp" 25 | ] 26 | }, 27 | "output_prefix": "web_log_iceberg_db/web_log_iceberg", 28 | "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web Log Analytics with Amazon Data Firehose Proxy using Amazon API Gateway 3 | 4 | This repository provides you cdk scripts and sample code on how to implement a simple [web analytics](https://en.wikipedia.org/wiki/Web_analytics) system. 5 | 6 | Below diagram shows what we are implementing. 7 | 8 | ![web-analytics-arch](web-analytics-datafirehose-iceberg-arch.svg) 9 | 10 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 
11 | 12 | This project is set up like a standard Python project. The initialization 13 | process also creates a virtualenv within this project, stored under the `.venv` 14 | directory. To create the virtualenv it assumes that there is a `python3` 15 | (or `python` for Windows) executable in your path with access to the `venv` 16 | package. If for any reason the automatic creation of the virtualenv fails, 17 | you can create the virtualenv manually. 18 | 19 | To manually create a virtualenv on MacOS and Linux: 20 | 21 | ``` 22 | $ python3 -m venv .venv 23 | ``` 24 | 25 | After the init process completes and the virtualenv is created, you can use the following 26 | step to activate your virtualenv. 27 | 28 | ``` 29 | $ source .venv/bin/activate 30 | ``` 31 | 32 | If you are on a Windows platform, activate the virtualenv like this: 33 | 34 | ``` 35 | % .venv\Scripts\activate.bat 36 | ``` 37 | 38 | Once the virtualenv is activated, you can install the required dependencies. 39 | 40 | ``` 41 | (.venv) $ pip install -r requirements.txt 42 | ``` 43 | 44 | To add additional dependencies, for example other CDK libraries, just add 45 | them to your `requirements.txt` file and rerun the `pip install -r requirements.txt` 46 | command. 47 | 48 | ### Upload Lambda Layer code 49 | 50 | Before deployment, upload the zipped Lambda layer code to S3 like this: 51 |
 52 | (.venv) $ aws s3api create-bucket --bucket your-s3-bucket-name-for-lambda-layer-code --region region-name
 53 | (.venv) $ ./build-aws-lambda-layer-package.sh your-s3-bucket-name-for-lambda-layer-code
 54 | 
55 | 56 | > :warning: To create a bucket outside of the `us-east-1` region, `aws s3api create-bucket` command requires the appropriate **LocationConstraint** to be specified in order to create the bucket in the desired region. For more information, see these [examples](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3api/create-bucket.html#examples). 57 | 58 | > :warning: Make sure you have **Docker** installed. 59 | 60 | For example, 61 |
 62 | (.venv) $ aws s3api create-bucket --bucket lambda-layer-resources --region us-east-1
 63 | (.venv) $ ./build-aws-lambda-layer-package.sh lambda-layer-resources
 64 | 
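
Optionally, you can confirm that the layer package landed where you expect before moving on. The following is a minimal sketch (not part of this repository) using boto3; the bucket name and object key are placeholders and should match the S3 path printed by `build-aws-lambda-layer-package.sh`:

```python
# check_layer_package.py - minimal sketch: verify the Lambda layer package exists in S3
# before referencing it in cdk.context.json. Bucket and key below are placeholders.
import boto3
from botocore.exceptions import ClientError

S3_BUCKET = "lambda-layer-resources"            # assumption: your layer bucket name
S3_KEY = "fastavro-lib-1.10.0-py-3.11.zip"      # assumption: key printed by the build script

s3 = boto3.client("s3")
try:
    head = s3.head_object(Bucket=S3_BUCKET, Key=S3_KEY)
    print(f"Found s3://{S3_BUCKET}/{S3_KEY} ({head['ContentLength']} bytes)")
except ClientError as err:
    print(f"Layer package not found: {err}")
```
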
65 | 66 | For more information about how to create a package for an AWS Lambda layer, see [here](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/). 67 | 68 | ### Deploy 69 | 70 | Before synthesizing the CloudFormation template for this code, you should update the `cdk.context.json` file.
71 | In particular, you need to fill in the S3 location of the Lambda layer package you uploaded earlier. 72 | 73 | For example, 74 |
 75 | {
 76 |   "firehose_data_tranform_lambda": {
 77 |     "s3_bucket_name": "lambda-layer-resources",
 78 |     "s3_object_key": "var/fastavro-lib.zip"
 79 |   },
 80 |   "data_firehose_configuration": {
 81 |     "stream_name": "PUT-Firehose-aEhWz",
 82 |     "buffering_hints": {
 83 |       "interval_in_seconds": 60,
 84 |       "size_in_mbs": 128
 85 |     },
 86 |     "transform_records_with_aws_lambda": {
 87 |       "buffer_size": 3,
 88 |       "buffer_interval": 300,
 89 |       "number_of_retries": 3
 90 |     },
 91 |     "destination_iceberg_table_configuration": {
 92 |       "database_name": "web_log_iceberg_db",
 93 |       "table_name": "web_log_iceberg"
 94 |     },
 95 |     "output_prefix": "web_log_iceberg_db/web_log_iceberg",
 96 |     "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}"
 97 |   }
 98 | }
 99 | 
100 | :information_source: The `database_name` and `table_name` values in `data_firehose_configuration.destination_iceberg_table_configuration` are used in the [**Set up Delivery Stream**](#set-up-delivery-stream) step. 101 | 102 | :information_source: When updating or deleting records in an Iceberg table, specify the table's primary key column names as `unique_keys` in the `data_firehose_configuration.destination_iceberg_table_configuration` settings. 103 | For example, 104 |
105 | "destination_iceberg_table_configuration": {
106 |   "database_name": "web_log_iceberg_db",
107 |   "table_name": "web_log_iceberg",
108 |   "unique_keys": [
109 |     "user_id", "timestamp"
110 |   ]
111 | }
112 | 
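
These context values are not only wired into the Data Firehose destination configuration; the CDK stack in `cdk_stacks/firehose_data_proc_lambda.py` also passes them to the transformation Lambda as the `IcebergDatabaseName`, `IcebergTableName`, and `IcebergTableUniqueKeys` environment variables (the unique keys are joined with commas). A minimal sketch of how a function reads them back:

```python
# Sketch only: how the transformer Lambda sees the cdk.context.json values above.
# Environment variable names come from cdk_stacks/firehose_data_proc_lambda.py.
import os

DATABASE_NAME = os.environ["IcebergDatabaseName"]    # e.g. "web_log_iceberg_db"
TABLE_NAME = os.environ["IcebergTableName"]          # e.g. "web_log_iceberg"
# Empty string when no unique_keys were configured; comma-separated otherwise.
UNIQUE_KEYS = [k for k in os.environ.get("IcebergTableUniqueKeys", "").split(",") if k]

print(DATABASE_NAME, TABLE_NAME, UNIQUE_KEYS)        # e.g. ['user_id', 'timestamp']
```
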
113 | 114 | 115 | Now you are ready to synthesize the CloudFormation template for this code.
116 | 117 |
118 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
119 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
120 | (.venv) $ cdk synth --all
121 | 
122 | 123 | Now let's try to deploy. 124 | 125 | ## List all CDK Stacks 126 | 127 | ``` 128 | (.venv) $ cdk list 129 | WebAnalyticsDataFirehoseProxyApiGw 130 | WebAnalyticsDataFirehoseToIcebergS3Path 131 | WebAnalyticsFirehoseDataTransformLambdaStack 132 | WebAnalyticsFirehoseToIcebergRoleStack 133 | WebAnalyticsGrantLFPermissionsOnFirehoseRole 134 | WebAnalyticsFirehoseToIcebergStack 135 | ``` 136 | 137 | Use `cdk deploy` command to create the stack shown above. 138 | 139 | ## Create API endpoint for web data collection 140 | 141 |
142 | (.venv) $ cdk deploy --require-approval never WebAnalyticsDataFirehoseProxyApiGw
143 | 
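
The stack exports the invoke URL as the `RestApiEndpointUrl` output, so you can read it from the CloudFormation console or programmatically. A minimal boto3 sketch, assuming the stack name used above:

```python
# Sketch: look up the API Gateway invoke URL exported by the WebAnalyticsDataFirehoseProxyApiGw stack.
import boto3

cfn = boto3.client("cloudformation")
stack = cfn.describe_stacks(StackName="WebAnalyticsDataFirehoseProxyApiGw")["Stacks"][0]
outputs = {o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])}
print(outputs["RestApiEndpointUrl"])  # e.g. https://<api-id>.execute-api.<region>.amazonaws.com/v1/
```
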
144 | 145 | ## Set up Delivery Stream 146 | 147 | 1. Create an S3 bucket for the Apache Iceberg table 148 |
149 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsDataFirehoseToIcebergS3Path
150 |    
151 | 2. Create a table with partitioned data in Amazon Athena 152 | 153 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console.
154 | * (step 1) Create a database 155 | 156 | In order to create a new database called `web_log_iceberg_db`, enter the following statement in the Athena query editor and click the **Run** button to execute the query. 157 | 158 |
159 |       CREATE DATABASE IF NOT EXISTS web_log_iceberg_db;
160 |       
161 | 162 | * (step 2) Create a table 163 | 164 | Copy the following query into the Athena query editor. 165 | 166 | Update `LOCATION` to your S3 bucket name and execute the query to create a new table. 167 |
168 |       CREATE TABLE web_log_iceberg_db.web_log_iceberg (
169 |         `user_id` string,
170 |         `session_id` string,
171 |         `event` string,
172 |         `referrer` string,
173 |         `user_agent` string,
174 |         `ip` string,
175 |         `hostname` string,
176 |         `os` string,
177 |         `timestamp` timestamp,
178 |         `uri` string
179 |       )
180 |       PARTITIONED BY (event)
181 |       LOCATION 's3://web-analytics-{region}-{account_id}/web_log_iceberg_db/web_log_iceberg'
182 |       TBLPROPERTIES (
183 |         'table_type'='iceberg',
184 |         'format'='parquet',
185 |         'write_compression'='snappy',
186 |         'optimize_rewrite_delete_file_threshold'='10'
187 |       );
188 |       
189 | If the query is successful, a table named `web_log_iceberg` is created and displayed on the left panel under the **Tables** section. 190 | 191 | If you get an error, check that (a) you have updated the `LOCATION` to the correct S3 bucket name, (b) you have `web_log_iceberg_db` selected under the **Database** dropdown, and (c) you have `AwsDataCatalog` selected as the **Data source**. 192 | 3. Create a Lambda function to process the streaming data (the record format this function must return is sketched after this list). 193 |
194 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseDataTransformLambdaStack
195 |    
196 | 4. To allow Data Firehose to ingest data into the Apache Iceberg table, create an IAM role and grant permissions to the role. 197 |
198 |    (.venv) $ cdk deploy --require-approval never \
199 |                  WebAnalyticsFirehoseToIcebergRoleStack \
200 |                  WebAnalyticsGrantLFPermissionsOnFirehoseRole
201 |    
202 | 203 | :information_source: If you fail to create the table, give Athena users access permissions on `web_log_iceberg_db` through [AWS Lake Formation](https://console.aws.amazon.com/lakeformation/home), or grant Amazon Data Firehose access to `web_log_iceberg_db` by running the following commands: 204 |
205 |    (.venv) $ aws lakeformation grant-permissions \
206 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
207 |                  --permissions CREATE_TABLE DESCRIBE ALTER DROP \
208 |                  --resource '{ "Database": { "Name": "web_log_iceberg_db" } }'
209 |    (.venv) $ aws lakeformation grant-permissions \
210 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
211 |                  --permissions SELECT DESCRIBE ALTER INSERT DELETE DROP \
212 |                  --resource '{ "Table": {"DatabaseName": "web_log_iceberg_db", "TableWildcard": {}} }'
213 |    
214 | 5. Deploy Amazon Data Firehose. 215 |
216 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseToIcebergStack
217 |    
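
For reference, the transformation Lambda deployed in step 3 follows the standard Amazon Data Firehose record-transformation contract: it receives base64-encoded records and must return each record with the same `recordId`, a `result` of `Ok`, `Dropped`, or `ProcessingFailed`, and base64-encoded `data`. The sketch below shows only that contract; the actual function in `src/main/python/IcebergTransformer/firehose_to_iceberg_transformer.py` additionally validates every record against an Avro schema before letting it through.

```python
# Minimal sketch of the Firehose data-transformation contract used by the Lambda in step 3.
# The production function in src/main/python/IcebergTransformer/ adds Avro schema validation.
import base64
import json


def lambda_handler(event, context):
    output = []
    for record in event["records"]:
        payload = json.loads(base64.b64decode(record["data"]))
        is_valid = isinstance(payload, dict) and "user_id" in payload  # placeholder check
        output.append({
            "recordId": record["recordId"],
            "result": "Ok" if is_valid else "ProcessingFailed",
            "data": base64.b64encode(json.dumps(payload).encode("utf-8")).decode("utf-8"),
        })
    return {"records": output}
```
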
218 | 219 | ## Run Test 220 | 221 | 1. Run the `GET /streams` method to invoke the Amazon Data Firehose `ListDeliveryStreams` API 222 |
223 |    $ curl -X GET https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1/streams
224 |    
225 | 226 | The response is: 227 |
228 |    {
229 |      "DeliveryStreamNames": [
230 |        "PUT-Firehose-aEhWz"
231 |      ],
232 |      "HasMoreDeliveryStreams": false
233 |    }
234 |    
235 | 236 | 2. Generate test data. 237 |
238 |    (.venv) $ pip install -r requirements-dev.txt
239 |    (.venv) $ python src/utils/gen_fake_data.py \
240 |                  --stream-name PUT-Firehose-aEhWz \
241 |                  --api-url 'https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1' \
242 |                  --api-method records \
243 |                  --max-count 5
244 | 
245 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"NxB5xOO4Y30ppGBZFfpDoREir/dcWwsF1j4NAie9K1N5pqpjZCSkJPM+7I+Wx7gB/H6hS1BUFGLVIQlR/xEsi7WzT6uA/JX4nXndcF7gxhn3UFGEyyFcgDXyjot5lCFJ5UNnhJk8gAeYT0Ghxj3BNTI22hgrfqdDnjo5MoAg8/0us408pDL37EF4DpIkFMAXWdZdwLRcS6cDt0o0XADBV17XwJnilrSv"}]}
246 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"slrDNLj+LPl1BAi6LzUVvUrhICOdnBY48gIG09zDGb/8fJElu3pYyTdfdNk9V+06rHz/ZY9RoV/0+UapEHaDDVqSjeDQZyZx0HeB2UDVP167Iv1DMgDvDIAiVlwcAyEsfUloqtRekM/B4NHEteJvCrPpqeQV8kYqk6EE1yJvJiLhBnyTVEuoVWbW4qiD+djsgijfL4EufK4ahdQN+CYs70HdUTEdQiV0"}]}
247 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"WGUixKjKAE3aXVe3FbhoRGVh1WomWht8/S1lqhUa6IhxN+tskX5xxO3PjsukPSDDMd9J5LwfzwSh7tt9PQMaqh2r6JDTvP3X3wFItGGrhqY6UD52zs/Z9WINpa1HWcl677xk/qec61gvD5QOpTXWmfG2Q/uWwuboIHoKIqigxeqMpsRpPH40TA6m0HF9AJVrZ5a2VI+OhEK9V/5VkaTI5aQ+Gltl/TSj"}]}
248 |    [200 OK] {"Encrypted":false,"FailedPutCount":0,"RequestResponses":[{"RecordId":"sOiJXXhDffAuoLaOm7E3y/8GIb9bwVbqrUcfotKT4H2iVQs3sPO1BxVwuaCMpfL8sQwpL4TSg5Y3EfLOzjrGlEOa4D14a3GAuffMQSEBVlwuJDED4JcFHJ/ltekVK/pMyejbBjyVk4e+S1oFK1LaXiGrcrVJ6XzJBk/NDnRLxGLYy+takFZMfyaStcZxXonnmdqw8YwWGgGnsbwj2nGVkR9PBWdyh41l"}]}
249 |    
250 |    251 | 3. Check streaming data in S3 252 | 253 | After `5~10` minutes, you can see that the streaming data has been delivered by **Amazon Data Firehose** to **S3**. 254 | 255 | ![iceberg-table](./assets/wa-iceberg-table.png) 256 | ![iceberg-table-data-level-01](./assets/wa-iceberg-data-level-01.png) 257 | ![iceberg-table-data-level-02](./assets/wa-iceberg-data-level-02.png) 258 | ![iceberg-table-data-level-03](./assets/wa-iceberg-data-level-03.png) 259 | 260 | 4. Run test query using Amazon Athena 261 | 262 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console. 263 | 264 | * (Step 1) Specify the workgroup to use 265 | 266 | To run queries, switch to the appropriate workgroup like this: 267 | ![amazon-athena-switching-to-workgroup](./assets/amazon-athena-switching-to-workgroup.png) 268 | 269 | * (Step 2) Run test query 270 | 271 | Enter the following SQL statement and execute the query. 272 |
273 |      SELECT COUNT(*)
274 |      FROM web_log_iceberg_db.web_log_iceberg;
275 |      
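
The same check can be scripted against the Athena API instead of the console. A minimal boto3 sketch; the workgroup name is a placeholder for the workgroup you selected in Step 1, and it assumes that workgroup already has a query result location configured:

```python
# Sketch: run the row-count query via the Athena API instead of the console.
import time

import boto3

athena = boto3.client("athena")
qid = athena.start_query_execution(
    QueryString="SELECT COUNT(*) FROM web_log_iceberg_db.web_log_iceberg",
    QueryExecutionContext={"Database": "web_log_iceberg_db"},
    WorkGroup="your-athena-workgroup",  # placeholder
)["QueryExecutionId"]

while True:
    state = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]["Status"]["State"]
    if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(2)

if state == "SUCCEEDED":
    rows = athena.get_query_results(QueryExecutionId=qid)["ResultSet"]["Rows"]
    print(rows[1]["Data"][0]["VarCharValue"])  # rows[0] is the header row
else:
    print(f"Query ended in state {state}")
```
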
276 | 277 | ## Clean Up 278 | 279 | Delete the CloudFormation stack by running the below command. 280 |
281 | (.venv) $ cdk destroy --force --all
282 | 
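
Note that the destination bucket is created with `RemovalPolicy.DESTROY` but without `auto_delete_objects` (see `cdk_stacks/s3.py`), so stack deletion can fail while the bucket still holds data. Emptying it first avoids that; a minimal sketch, assuming the default `web-analytics-{region}-{account}` bucket name:

```python
# Sketch: empty the destination bucket before `cdk destroy` so bucket deletion can succeed.
import boto3

region = boto3.session.Session().region_name
account_id = boto3.client("sts").get_caller_identity()["Account"]
bucket_name = f"web-analytics-{region}-{account_id}"  # default name from cdk_stacks/s3.py

boto3.resource("s3").Bucket(bucket_name).objects.all().delete()
print(f"Emptied s3://{bucket_name}")
```
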
283 | 284 | 285 | ## Useful commands 286 | 287 | * `cdk ls` list all stacks in the app 288 | * `cdk synth` emits the synthesized CloudFormation template 289 | * `cdk deploy` deploy this stack to your default AWS account/region 290 | * `cdk diff` compare deployed stack with current state 291 | * `cdk docs` open CDK documentation 292 | 293 | Enjoy! 294 | 295 | ## References 296 | 297 | * [Web Analytics](https://en.wikipedia.org/wiki/Web_analytics) 298 | * [Tutorial: Create a REST API as an Amazon Kinesis proxy in API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html) 299 | * [Streaming Data Solution for Amazon Kinesis](https://aws.amazon.com/ko/solutions/implementations/aws-streaming-data-solution-for-amazon-kinesis/) 300 |
301 | 302 |
303 | * [(AWS Developer Guide) Deliver data to Apache Iceberg Tables with Amazon Data Firehose](https://docs.aws.amazon.com/firehose/latest/dev/apache-iceberg-destination.html) 304 | * [Building fine-grained authorization using Amazon Cognito, API Gateway, and IAM](https://aws.amazon.com/ko/blogs/security/building-fine-grained-authorization-using-amazon-cognito-api-gateway-and-iam/) 305 | * [AWS Lake Formation - Create a data lake administrator](https://docs.aws.amazon.com/lake-formation/latest/dg/getting-started-setup.html#create-data-lake-admin) 306 | * [AWS Lake Formation Permissions Reference](https://docs.aws.amazon.com/lake-formation/latest/dg/lf-permissions-reference.html) 307 | * [Amazon Athena Using Iceberg tables](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html) 308 | * [Amazon Athena Workshop](https://athena-in-action.workshop.aws/) 309 | * [Curl Cookbook](https://catonmat.net/cookbooks/curl) 310 | * [fastavro](https://fastavro.readthedocs.io/) - Fast read/write of `AVRO` files 311 | * [Apache Avro Specification](https://avro.apache.org/docs/current/spec.html) 312 | * [How to create a Lambda layer using a simulated Lambda environment with Docker](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/) 313 | ``` 314 | $ cat < requirements-Lambda-Layer.txt 315 | > fastavro==1.6.1 316 | > EOF 317 | $ docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install -r requirements-Lambda-Layer.txt -t python/lib/python3.11/site-packages/; exit" 318 | $ zip -r fastavro-lib.zip python > /dev/null 319 | $ aws s3 mb s3://my-bucket-for-lambda-layer-packages 320 | $ aws s3 cp fastavro-lib.zip s3://my-bucket-for-lambda-layer-packages/ 321 | ``` 322 | 323 | ## Security 324 | 325 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 326 | 327 | ## License 328 | 329 | This library is licensed under the MIT-0 License. See the LICENSE file. 
330 | 331 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from cdk_stacks import ( 10 | DataFirehoseProxyStack, 11 | FirehoseToIcebergStack, 12 | FirehoseRoleStack, 13 | FirehoseDataProcLambdaStack, 14 | DataLakePermissionsStack, 15 | S3BucketStack, 16 | ) 17 | 18 | AWS_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 19 | region=os.getenv('CDK_DEFAULT_REGION')) 20 | 21 | app = cdk.App() 22 | 23 | kds_proxy_apigw = DataFirehoseProxyStack(app, 'WebAnalyticsDataFirehoseProxyApiGw') 24 | 25 | s3_dest_bucket = S3BucketStack(app, 'WebAnalyticsDataFirehoseToIcebergS3Path', 26 | env=AWS_ENV) 27 | 28 | firehose_data_transform_lambda = FirehoseDataProcLambdaStack(app, 29 | 'WebAnalyticsFirehoseDataTransformLambdaStack', 30 | env=AWS_ENV 31 | ) 32 | firehose_data_transform_lambda.add_dependency(s3_dest_bucket) 33 | 34 | firehose_role = FirehoseRoleStack(app, 'WebAnalyticsFirehoseToIcebergRoleStack', 35 | firehose_data_transform_lambda.data_proc_lambda_fn, 36 | s3_dest_bucket.s3_bucket, 37 | env=AWS_ENV 38 | ) 39 | firehose_role.add_dependency(firehose_data_transform_lambda) 40 | 41 | grant_lake_formation_permissions = DataLakePermissionsStack(app, 'WebAnalyticsGrantLFPermissionsOnFirehoseRole', 42 | firehose_role.firehose_role, 43 | env=AWS_ENV 44 | ) 45 | grant_lake_formation_permissions.add_dependency(firehose_role) 46 | 47 | firehose_stack = FirehoseToIcebergStack(app, 'WebAnalyticsFirehoseToIcebergStack', 48 | firehose_data_transform_lambda.data_proc_lambda_fn, 49 | s3_dest_bucket.s3_bucket, 50 | firehose_role.firehose_role, 51 | env=AWS_ENV 52 | ) 53 | firehose_stack.add_dependency(grant_lake_formation_permissions) 54 | 55 | app.synth() 56 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/amazon-athena-switching-to-workgroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/amazon-athena-switching-to-workgroup.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-01.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-02.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-03.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-data-level-03.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/assets/wa-iceberg-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-datafirehose-iceberg/assets/wa-iceberg-table.png -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/build-aws-lambda-layer-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash - 2 | 3 | VERSION=1.10.0 4 | PY_VERSION=3.11 5 | LAMBDA_LAYER_NAME=fastavro-lib-${VERSION}-py-${PY_VERSION} 6 | S3_PATH=$1 7 | 8 | docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install fastavro==${VERSION} -t python/lib/python3.11/site-packages/; exit" 9 | 10 | zip -q -r ${LAMBDA_LAYER_NAME}.zip python >/dev/null 11 | aws s3 cp --quiet ${LAMBDA_LAYER_NAME}.zip s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip 12 | echo "[Lambda_Layer_Code_S3_Path] s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip" 13 | 14 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .apigw import DataFirehoseProxyStack 2 | from .firehose_to_iceberg import FirehoseToIcebergStack 3 | from .firehose_role import FirehoseRoleStack 4 | from .firehose_data_proc_lambda import FirehoseDataProcLambdaStack 5 | from .lake_formation import DataLakePermissionsStack 6 | from .s3 import S3BucketStack 
-------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/apigw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import json 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_apigateway, 12 | aws_iam 13 | ) 14 | from constructs import Construct 15 | 16 | 17 | class DataFirehoseProxyStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | apigw_kds_access_role_policy_doc = aws_iam.PolicyDocument() 23 | apigw_kds_access_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": ["*"], 26 | "actions": [ 27 | "firehose:DescribeDeliveryStream", 28 | "firehose:PutRecord", 29 | "firehose:PutRecordBatch" 30 | ] 31 | })) 32 | 33 | apigw_datafirehose_role = aws_iam.Role(self, "APIGatewayRoleToAccessDataFirehose", 34 | role_name=f"APIGatewayRoleToAccessDataFirehose", 35 | assumed_by=aws_iam.ServicePrincipal('apigateway.amazonaws.com'), 36 | inline_policies={ 37 | 'DataFirehoseWriteAccess': apigw_kds_access_role_policy_doc 38 | }, 39 | managed_policies=[ 40 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisFirehoseReadOnlyAccess') 41 | ] 42 | ) 43 | 44 | #XXX: Start to create an API as a Kinesis proxy 45 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-api-as-kinesis-proxy 46 | datafirehose_proxy_api = aws_apigateway.RestApi(self, "DataFirehoseProxyAPI", 47 | rest_api_name="log-collector", 48 | description="An Amazon API Gateway REST API that integrated with an Amazon Data Firehose.", 49 | endpoint_types=[aws_apigateway.EndpointType.REGIONAL], 50 | default_cors_preflight_options={ 51 | "allow_origins": aws_apigateway.Cors.ALL_ORIGINS 52 | }, 53 | deploy=True, 54 | deploy_options=aws_apigateway.StageOptions(stage_name="v1"), 55 | endpoint_export_name="DataFirehoseProxyAPIEndpoint" 56 | ) 57 | 58 | apigw_error_responses = [ 59 | aws_apigateway.IntegrationResponse(status_code="400", selection_pattern="4\\d{2}"), 60 | aws_apigateway.IntegrationResponse(status_code="500", selection_pattern="5\\d{2}") 61 | ] 62 | 63 | #XXX: GET /streams 64 | # List Kinesis streams by using the API Gateway console 65 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-list-kinesis-streams 66 | 67 | streams_resource = datafirehose_proxy_api.root.add_resource("streams") 68 | 69 | list_streams_options = aws_apigateway.IntegrationOptions( 70 | credentials_role=apigw_datafirehose_role, 71 | integration_responses=[ 72 | aws_apigateway.IntegrationResponse( 73 | status_code="200" 74 | ), 75 | *apigw_error_responses 76 | ], 77 | request_templates={ 78 | 'application/json': '{}' 79 | }, 80 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 81 | ) 82 | 83 | list_streams_integration = aws_apigateway.AwsIntegration( 84 | service='firehose', 85 | action='ListDeliveryStreams', 86 | integration_http_method='POST', 87 | options=list_streams_options 88 | ) 89 | 90 | streams_resource.add_method("GET", list_streams_integration, 91 | # Default `authorization_type`: - open access unless `authorizer` is specified 92 | 
authorization_type=aws_apigateway.AuthorizationType.NONE, 93 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 94 | response_models={ 95 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 96 | } 97 | ), 98 | aws_apigateway.MethodResponse(status_code='400'), 99 | aws_apigateway.MethodResponse(status_code='500') 100 | ]) 101 | 102 | #XXX: GET /streams/{stream-name} 103 | # Describe a stream in Kinesis 104 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-describe-delete-stream 105 | one_stream_resource = streams_resource.add_resource("{stream-name}") 106 | 107 | describe_stream_options = aws_apigateway.IntegrationOptions( 108 | credentials_role=apigw_datafirehose_role, 109 | integration_responses=[ 110 | aws_apigateway.IntegrationResponse( 111 | status_code="200" 112 | ), 113 | *apigw_error_responses 114 | ], 115 | request_templates={ 116 | 'application/json': json.dumps({ 117 | "DeliveryStreamName": "$input.params('stream-name')" 118 | }, indent=2) 119 | }, 120 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 121 | ) 122 | 123 | describe_stream_integration = aws_apigateway.AwsIntegration( 124 | service='firehose', 125 | action='DescribeDeliveryStream', 126 | integration_http_method='POST', 127 | options=describe_stream_options 128 | ) 129 | 130 | one_stream_resource.add_method("GET", describe_stream_integration, 131 | # Default `authorization_type`: - open access unless `authorizer` is specified 132 | authorization_type=aws_apigateway.AuthorizationType.NONE, 133 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 134 | response_models={ 135 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 136 | } 137 | ), 138 | aws_apigateway.MethodResponse(status_code='400'), 139 | aws_apigateway.MethodResponse(status_code='500') 140 | ]) 141 | 142 | #XXX: PUT /streams/{stream-name}/record 143 | # Put a record into a stream in Kinesis 144 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 145 | record_resource = one_stream_resource.add_resource("record") 146 | 147 | put_record_request_mapping_templates = ''' 148 | { 149 | "DeliveryStreamName": "$input.params('stream-name')", 150 | "Record": { 151 | "Data": "$util.base64Encode($input.json('$.Data'))" 152 | } 153 | } 154 | ''' 155 | 156 | put_record_options = aws_apigateway.IntegrationOptions( 157 | credentials_role=apigw_datafirehose_role, 158 | integration_responses=[ 159 | aws_apigateway.IntegrationResponse( 160 | status_code="200" 161 | ), 162 | *apigw_error_responses 163 | ], 164 | request_templates={ 165 | 'application/json': put_record_request_mapping_templates 166 | }, 167 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 168 | ) 169 | 170 | put_record_integration = aws_apigateway.AwsIntegration( 171 | service='firehose', 172 | action='PutRecord', 173 | integration_http_method='POST', 174 | options=put_record_options 175 | ) 176 | 177 | record_resource.add_method("PUT", put_record_integration, 178 | # Default `authorization_type`: - open access unless `authorizer` is specified 179 | authorization_type=aws_apigateway.AuthorizationType.NONE, 180 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 181 | response_models={ 182 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 183 | } 184 | ), 185 | aws_apigateway.MethodResponse(status_code='400'), 186 | 
aws_apigateway.MethodResponse(status_code='500') 187 | ]) 188 | 189 | 190 | #XXX: PUT /streams/{stream-name}/records 191 | # Put records into a stream in Kinesis 192 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 193 | records_resource = one_stream_resource.add_resource("records") 194 | 195 | put_records_request_mapping_templates = ''' 196 | { 197 | "DeliveryStreamName": "$input.params('stream-name')", 198 | "Records": [ 199 | #foreach($elem in $input.path('$.records')) 200 | { 201 | "Data": "$util.base64Encode($elem.data)" 202 | }#if($foreach.hasNext),#end 203 | #end 204 | ] 205 | } 206 | ''' 207 | 208 | put_records_options = aws_apigateway.IntegrationOptions( 209 | credentials_role=apigw_datafirehose_role, 210 | integration_responses=[ 211 | aws_apigateway.IntegrationResponse( 212 | status_code="200" 213 | ), 214 | *apigw_error_responses 215 | ], 216 | request_templates={ 217 | 'application/json': put_records_request_mapping_templates 218 | }, 219 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 220 | ) 221 | 222 | put_records_integration = aws_apigateway.AwsIntegration( 223 | service='firehose', 224 | action='PutRecordBatch', 225 | integration_http_method='POST', 226 | options=put_records_options 227 | ) 228 | 229 | records_resource.add_method("PUT", put_records_integration, 230 | # Default `authorization_type`: - open access unless `authorizer` is specified 231 | authorization_type=aws_apigateway.AuthorizationType.NONE, 232 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 233 | response_models={ 234 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 235 | } 236 | ), 237 | aws_apigateway.MethodResponse(status_code='400'), 238 | aws_apigateway.MethodResponse(status_code='500') 239 | ]) 240 | 241 | 242 | cdk.CfnOutput(self, 'RestApiEndpointUrl', 243 | value=datafirehose_proxy_api.url, 244 | export_name=f'{self.stack_name}-RestApiEndpointUrl') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/firehose_data_proc_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_lambda, 12 | aws_logs, 13 | aws_s3 as s3 14 | ) 15 | from constructs import Construct 16 | 17 | 18 | class FirehoseDataProcLambdaStack(Stack): 19 | 20 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 21 | super().__init__(scope, construct_id, **kwargs) 22 | 23 | firehose_data_transform_lambda_config = self.node.try_get_context('firehose_data_tranform_lambda') 24 | LAMBDA_LAYER_CODE_S3_BUCKET = firehose_data_transform_lambda_config['s3_bucket_name'] 25 | LAMBDA_LAYER_CODE_S3_OBJ_KEY = firehose_data_transform_lambda_config['s3_object_key'] 26 | 27 | s3_lambda_layer_lib_bucket = s3.Bucket.from_bucket_name(self, "LambdaLayerS3Bucket", LAMBDA_LAYER_CODE_S3_BUCKET) 28 | lambda_lib_layer = aws_lambda.LayerVersion(self, "SchemaValidatorLib", 29 | layer_version_name="fastavro-lib", 30 | compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11], 31 | code=aws_lambda.Code.from_bucket(s3_lambda_layer_lib_bucket, LAMBDA_LAYER_CODE_S3_OBJ_KEY) 32 | ) 33 | 34 | data_firehose_configuration = 
self.node.try_get_context("data_firehose_configuration") 35 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 36 | dest_iceberg_table_unique_keys = dest_iceberg_table_config.get("unique_keys", None) 37 | dest_iceberg_table_unique_keys = ",".join(dest_iceberg_table_unique_keys) if dest_iceberg_table_unique_keys else "" 38 | 39 | LAMBDA_FN_NAME = "WebAnalyticsFirehoseToIcebergTransformer" 40 | self.data_proc_lambda_fn = aws_lambda.Function(self, "FirehoseToIcebergTransformer", 41 | runtime=aws_lambda.Runtime.PYTHON_3_11, 42 | function_name=LAMBDA_FN_NAME, 43 | handler="firehose_to_iceberg_transformer.lambda_handler", 44 | description="Transform records to Apache Iceberg table", 45 | code=aws_lambda.Code.from_asset(os.path.join(os.path.dirname(__file__), '../src/main/python/IcebergTransformer')), 46 | environment={ 47 | "IcebergDatabaseName": dest_iceberg_table_config["database_name"], 48 | "IcebergTableName": dest_iceberg_table_config["table_name"], 49 | "IcebergTableUniqueKeys": dest_iceberg_table_unique_keys 50 | }, 51 | timeout=cdk.Duration.minutes(5), 52 | #XXX: set memory size appropriately 53 | memory_size=256, 54 | layers=[lambda_lib_layer] 55 | ) 56 | 57 | log_group = aws_logs.LogGroup(self, "FirehoseToIcebergTransformerLogGroup", 58 | #XXX: Circular dependency between resources occurs 59 | # if aws_lambda.Function.function_name is used 60 | # instead of literal name of lambda function such as "FirehoseToIcebergTransformer" 61 | log_group_name=f"/aws/lambda/{LAMBDA_FN_NAME}", 62 | retention=aws_logs.RetentionDays.THREE_DAYS, 63 | removal_policy=cdk.RemovalPolicy.DESTROY 64 | ) 65 | log_group.grant_write(self.data_proc_lambda_fn) 66 | 67 | 68 | cdk.CfnOutput(self, 'FirehoseDataProcFuncName', 69 | value=self.data_proc_lambda_fn.function_name, 70 | export_name=f'{self.stack_name}-FirehoseDataProcFuncName') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/firehose_role.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_iam 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class FirehoseRoleStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, 17 | data_transform_lambda_fn, s3_bucket, **kwargs) -> None: 18 | 19 | super().__init__(scope, construct_id, **kwargs) 20 | 21 | firehose_role_policy_doc = aws_iam.PolicyDocument() 22 | 23 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": [s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)], 26 | "actions": [ 27 | "s3:AbortMultipartUpload", 28 | "s3:GetBucketLocation", 29 | "s3:GetObject", 30 | "s3:ListBucket", 31 | "s3:ListBucketMultipartUploads", 32 | "s3:PutObject", 33 | "s3:DeleteObject" 34 | ] 35 | })) 36 | 37 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 38 | "effect": aws_iam.Effect.ALLOW, 39 | "resources": [ 40 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog", 41 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:database/*", 42 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:table/*/*" 43 | ], 44 | "actions": [ 45 | "glue:GetTable", 46 | "glue:GetDatabase", 47 | "glue:UpdateTable" 48 | ] 49 | })) 50 | 51 | 
firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 52 | effect=aws_iam.Effect.ALLOW, 53 | resources=["*"], 54 | actions=[ 55 | "ec2:DescribeVpcs", 56 | "ec2:DescribeVpcAttribute", 57 | "ec2:DescribeSubnets", 58 | "ec2:DescribeSecurityGroups", 59 | "ec2:DescribeNetworkInterfaces", 60 | "ec2:CreateNetworkInterface", 61 | "ec2:CreateNetworkInterfacePermission", 62 | "ec2:DeleteNetworkInterface" 63 | ] 64 | )) 65 | 66 | #XXX: https://docs.aws.amazon.com/ko_kr/cdk/latest/guide/tokens.html 67 | # String-encoded tokens: 68 | # Avoid manipulating the string in other ways. For example, 69 | # taking a substring of a string is likely to break the string token. 70 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 71 | firehose_stream_name = data_firehose_configuration['stream_name'] 72 | 73 | firehose_log_group_name = f"/aws/kinesisfirehose/{firehose_stream_name}" 74 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 75 | effect=aws_iam.Effect.ALLOW, 76 | #XXX: The ARN will be formatted as follows: 77 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 78 | resources=[self.format_arn(service="logs", resource="log-group", 79 | resource_name="{}:log-stream:*".format(firehose_log_group_name), 80 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 81 | actions=["logs:PutLogEvents"] 82 | )) 83 | 84 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 85 | "effect": aws_iam.Effect.ALLOW, 86 | "resources": [f"{data_transform_lambda_fn.function_arn}:*"], 87 | "actions": [ 88 | "lambda:InvokeFunction", 89 | "lambda:GetFunctionConfiguration" 90 | ] 91 | })) 92 | 93 | self.firehose_role = aws_iam.Role(self, "KinesisFirehoseServiceRole", 94 | role_name=f"KinesisFirehoseServiceRole-{firehose_stream_name}-{self.region}", 95 | assumed_by=aws_iam.ServicePrincipal("firehose.amazonaws.com"), 96 | #XXX: use inline_policies to work around https://github.com/aws/aws-cdk/issues/5221 97 | inline_policies={ 98 | "firehose_role_policy": firehose_role_policy_doc 99 | } 100 | ) 101 | 102 | 103 | cdk.CfnOutput(self, 'FirehoseRole', 104 | value=self.firehose_role.role_name, 105 | export_name=f'{self.stack_name}-Role') 106 | cdk.CfnOutput(self, 'FirehoseRoleArn', 107 | value=self.firehose_role.role_arn, 108 | export_name=f'{self.stack_name}-RoleArn') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/firehose_to_iceberg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_kinesisfirehose 10 | ) 11 | from constructs import Construct 12 | 13 | from aws_cdk.aws_kinesisfirehose import CfnDeliveryStream as cfn_delivery_stream 14 | 15 | 16 | class FirehoseToIcebergStack(Stack): 17 | 18 | def __init__(self, scope: Construct, construct_id: str, 19 | data_transform_lambda_fn, s3_bucket, 20 | firehose_role, **kwargs) -> None: 21 | 22 | super().__init__(scope, construct_id, **kwargs) 23 | 24 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 25 | 26 | delivery_stream_name = data_firehose_configuration['stream_name'] 27 | 28 | firehose_log_group_name = f"/aws/kinesisfirehose/{delivery_stream_name}" 29 | 30 | firehose_buffering_hints = data_firehose_configuration["buffering_hints"] 31 | 
firehose_buffer_size = firehose_buffering_hints["size_in_mbs"] 32 | firehose_buffer_interval = firehose_buffering_hints["interval_in_seconds"] 33 | 34 | transform_records_with_aws_lambda = data_firehose_configuration["transform_records_with_aws_lambda"] 35 | firehose_lambda_buffer_size = transform_records_with_aws_lambda["buffer_size"] 36 | firehose_lambda_buffer_interval = transform_records_with_aws_lambda["buffer_interval"] 37 | firehose_lambda_number_of_retries = transform_records_with_aws_lambda["number_of_retries"] 38 | 39 | s3_output_prefix = data_firehose_configuration["output_prefix"] 40 | s3_error_output_prefix = data_firehose_configuration["error_output_prefix"] 41 | 42 | lambda_proc = cfn_delivery_stream.ProcessorProperty( 43 | type="Lambda", 44 | parameters=[ 45 | cfn_delivery_stream.ProcessorParameterProperty( 46 | parameter_name="LambdaArn", 47 | parameter_value='{}:{}'.format( 48 | data_transform_lambda_fn.function_arn, 49 | data_transform_lambda_fn.latest_version.version 50 | ) 51 | ), 52 | cfn_delivery_stream.ProcessorParameterProperty( 53 | parameter_name="NumberOfRetries", 54 | parameter_value=str(firehose_lambda_number_of_retries) 55 | ), 56 | cfn_delivery_stream.ProcessorParameterProperty( 57 | parameter_name="RoleArn", 58 | parameter_value=firehose_role.role_arn 59 | ), 60 | cfn_delivery_stream.ProcessorParameterProperty( 61 | parameter_name="BufferSizeInMBs", 62 | parameter_value=str(firehose_lambda_buffer_size) 63 | ), 64 | cfn_delivery_stream.ProcessorParameterProperty( 65 | parameter_name="BufferIntervalInSeconds", 66 | parameter_value=str(firehose_lambda_buffer_interval) 67 | ) 68 | ] 69 | ) 70 | 71 | firehose_processing_config = cfn_delivery_stream.ProcessingConfigurationProperty( 72 | enabled=True, 73 | processors=[ 74 | lambda_proc 75 | ] 76 | ) 77 | 78 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 79 | dest_iceberg_table_unique_keys = dest_iceberg_table_config.get("unique_keys", None) 80 | dest_iceberg_table_unique_keys = dest_iceberg_table_unique_keys if dest_iceberg_table_unique_keys else None 81 | 82 | iceberg_dest_config = cfn_delivery_stream.IcebergDestinationConfigurationProperty( 83 | catalog_configuration=cfn_delivery_stream.CatalogConfigurationProperty( 84 | catalog_arn=f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog" 85 | ), 86 | role_arn=firehose_role.role_arn, 87 | s3_configuration=cfn_delivery_stream.S3DestinationConfigurationProperty( 88 | bucket_arn=s3_bucket.bucket_arn, 89 | role_arn=firehose_role.role_arn, 90 | buffering_hints={ 91 | "intervalInSeconds": firehose_buffer_interval, 92 | "sizeInMBs": firehose_buffer_size 93 | }, 94 | cloud_watch_logging_options={ 95 | "enabled": True, 96 | "logGroupName": firehose_log_group_name, 97 | "logStreamName": "DestinationDelivery" 98 | }, 99 | compression_format="UNCOMPRESSED", # [GZIP | HADOOP_SNAPPY | Snappy | UNCOMPRESSED | ZIP] 100 | error_output_prefix=s3_error_output_prefix, 101 | prefix=s3_output_prefix, 102 | ), 103 | buffering_hints={ 104 | "intervalInSeconds": firehose_buffer_interval, 105 | "sizeInMBs": firehose_buffer_size 106 | }, 107 | cloud_watch_logging_options={ 108 | "enabled": True, 109 | "logGroupName": firehose_log_group_name, 110 | "logStreamName": "DestinationDelivery" 111 | }, 112 | destination_table_configuration_list=[ 113 | cfn_delivery_stream.DestinationTableConfigurationProperty( 114 | destination_database_name=dest_iceberg_table_config["database_name"], 115 | 
destination_table_name=dest_iceberg_table_config["table_name"], 116 | unique_keys=dest_iceberg_table_unique_keys 117 | ) 118 | ], 119 | processing_configuration=firehose_processing_config, 120 | s3_backup_mode='FailedDataOnly' 121 | ) 122 | 123 | delivery_stream = aws_kinesisfirehose.CfnDeliveryStream(self, "FirehoseToIceberg", 124 | delivery_stream_name=delivery_stream_name, 125 | delivery_stream_type="DirectPut", 126 | iceberg_destination_configuration=iceberg_dest_config, 127 | tags=[{"key": "Name", "value": delivery_stream_name}] 128 | ) 129 | 130 | 131 | cdk.CfnOutput(self, 'S3DestBucket', 132 | value=s3_bucket.bucket_name, 133 | export_name=f'{self.stack_name}-S3DestBucket') 134 | cdk.CfnOutput(self, 'DataFirehoseStreamName', 135 | value=delivery_stream.delivery_stream_name, 136 | export_name=f'{self.stack_name}-FirehoseStreamName') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/lake_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, firehose_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 20 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 21 | database_name=dest_iceberg_table_config["database_name"] 22 | 23 | #XXXX: The role assumed by cdk is not a data lake administrator. 24 | # So, deploying PrincipalPermissions meets the error such as: 25 | # "Resource does not exist or requester is not authorized to access requested permissions." 26 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
27 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 28 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 29 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 30 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 31 | )] 32 | ) 33 | 34 | cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions", 35 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 36 | permissions_with_grant_option=[], 37 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 38 | data_lake_principal_identifier=firehose_role.role_arn 39 | ), 40 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 41 | #XXX: Can't specify a TableWithColumns resource and a Table resource 42 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 43 | catalog_id=cdk.Aws.ACCOUNT_ID, 44 | database_name=database_name, 45 | # name="ALL_TABLES", 46 | table_wildcard={} 47 | ) 48 | ) 49 | ) 50 | cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 51 | 52 | #XXX: In order to keep resource destruction order, 53 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 54 | cfn_principal_permissions.add_dependency(cfn_data_lake_settings) 55 | 56 | 57 | cdk.CfnOutput(self, 'Principal', 58 | value=cfn_principal_permissions.attr_principal_identifier, 59 | export_name=f'{self.stack_name}-Principal') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/cdk_stacks/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_s3 as s3 10 | ) 11 | 12 | from constructs import Construct 13 | 14 | 15 | class S3BucketStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | S3_DEFAULT_BUCKET_NAME = f"web-analytics-{self.region}-{self.account}" 21 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 22 | s3_bucket_name = data_firehose_configuration.get('s3_bucket_name', S3_DEFAULT_BUCKET_NAME) 23 | 24 | self.s3_bucket = s3.Bucket(self, "s3bucket", 25 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 26 | bucket_name=s3_bucket_name) 27 | 28 | 29 | cdk.CfnOutput(self, 'S3BucketName', 30 | value=self.s3_bucket.bucket_name, 31 | export_name=f'{self.stack_name}-S3BucketName') -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.24.41 2 | mimesis==18.0.0 3 | requests>=2.31.0 4 | 5 | # packages for Lambda Layer 6 | fastavro==1.10.0 7 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.185.0 2 | constructs>=10.0.0,<11.0.0 3 | 
-------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/src/main/python/IcebergTransformer/firehose_to_iceberg_transformer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | #vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import base64 6 | import collections 7 | import json 8 | import logging 9 | import os 10 | from datetime import datetime 11 | 12 | import fastavro 13 | 14 | 15 | LOGGER = logging.getLogger() 16 | if len(LOGGER.handlers) > 0: 17 | # The Lambda environment pre-configures a handler logging to stderr. 18 | # If a handler is already configured, `.basicConfig` does not execute. 19 | # Thus we set the level directly. 20 | LOGGER.setLevel(logging.INFO) 21 | else: 22 | logging.basicConfig(level=logging.INFO) 23 | 24 | 25 | DESTINATION_DATABASE_NAME = os.environ['IcebergDatabaseName'] 26 | DESTINATION_TABLE_NAME = os.environ['IcebergTableName'] 27 | DESTINATION_TABLE_UNIQUE_KEYS = os.environ.get('IcebergTableUniqueKeys', None) 28 | 29 | ORIGINAL_SCHEMA = { 30 | 'name': 'WebLogs', 31 | 'type': 'record', 32 | 'fields': [ 33 | { 34 | 'name': 'user_id', 35 | 'type': 'string' 36 | }, 37 | { 38 | 'name': 'session_id', 39 | 'type': 'string' 40 | }, 41 | { 42 | 'name': 'event', 43 | 'type': 'string' 44 | }, 45 | { 46 | 'name': 'referrer', 47 | 'type': ['string', 'null'] 48 | }, 49 | { 50 | 'name': 'user_agent', 51 | 'type': ['string', 'null'] 52 | }, 53 | { 54 | 'name': 'ip', 55 | 'type': 'string' 56 | }, 57 | { 58 | 'name': 'hostname', 59 | 'type': 'string' 60 | }, 61 | { 62 | 'name': 'os', 63 | 'type': ['string', 'null'] 64 | }, 65 | { 66 | 'name': 'timestamp', 67 | 'type': { 68 | 'type': 'string', 69 | 'logicalType': 'datetime' 70 | } 71 | }, 72 | { 73 | 'name': 'uri', 74 | 'type': 'string' 75 | } 76 | ] 77 | } 78 | 79 | 80 | def read_datetime(data, writer_schema=None, reader_schema=None): 81 | return datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 82 | 83 | 84 | def prepare_datetime(data, schema): 85 | """Converts datetime.datetime to string representing the date and time""" 86 | if isinstance(data, datetime): 87 | return datetime.strftime('%Y-%m-%dT%H:%M:%SZ') 88 | else: 89 | try: 90 | dt = datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 91 | return dt.strftime('%Y-%m-%dT%H:%M:%SZ') 92 | except Exception as ex: 93 | return None 94 | 95 | 96 | fastavro.read.LOGICAL_READERS["string-datetime"] = read_datetime 97 | fastavro.write.LOGICAL_WRITERS["string-datetime"] = prepare_datetime 98 | 99 | PARSED_SCHEMA = fastavro.parse_schema(ORIGINAL_SCHEMA) 100 | 101 | 102 | def check_schema(record): 103 | try: 104 | return fastavro.validation.validate(record, PARSED_SCHEMA, raise_errors=False) 105 | except Exception as ex: 106 | LOGGER.error(ex) 
107 | return False 108 | 109 | 110 | def lambda_handler(event, context): 111 | counter = collections.Counter(total=0, valid=0, invalid=0) 112 | firehose_records_output = {'records': []} 113 | 114 | unique_keys_exist = True if DESTINATION_TABLE_UNIQUE_KEYS else False 115 | otf_metadata_operation = 'insert' if not unique_keys_exist else 'update' 116 | 117 | for record in event['records']: 118 | counter['total'] += 1 119 | 120 | payload = base64.b64decode(record['data']).decode('utf-8') 121 | json_value = json.loads(payload) 122 | 123 | #XXX: check if schema is valid 124 | is_valid = check_schema(json_value) 125 | counter['valid' if is_valid else 'invalid'] += 1 126 | 127 | firehose_record = { 128 | 'data': base64.b64encode(payload.encode('utf-8')), 129 | 'recordId': record['recordId'], 130 | 'result': 'Ok' if is_valid else 'ProcessingFailed', # [Ok, Dropped, ProcessingFailed] 131 | 'metadata': { 132 | 'otfMetadata': { 133 | 'destinationDatabaseName': DESTINATION_DATABASE_NAME, 134 | 'destinationTableName': DESTINATION_TABLE_NAME, 135 | 'operation': otf_metadata_operation 136 | } 137 | } 138 | } 139 | 140 | firehose_records_output['records'].append(firehose_record) 141 | 142 | LOGGER.info(', '.join("{}={}".format(k, v) for k, v in counter.items())) 143 | 144 | return firehose_records_output 145 | 146 | 147 | if __name__ == '__main__': 148 | import pprint 149 | 150 | record_list = [ 151 | ('Ok', { 152 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 153 | "session_id": "a5aa20a72c9e37588f9bbeaa", 154 | "event": "view", 155 | "referrer": "brandon.biz", 156 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 157 | "ip": "202.165.71.49", 158 | "hostname": "toxic.tokyo", 159 | "os": "openSUSE", 160 | "timestamp": "2022-09-16T07:35:46Z", 161 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 162 | }), 163 | ('Ok', { 164 | "user_id": "70b1f606-aa63-47fb-bc92-76de9c59d064", 165 | "session_id": "928e78473db8449b17644b2c", 166 | "event": "like", 167 | # missing optional data 168 | # "referrer": "toe.gq", 169 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.53", 170 | "ip": "12.166.113.176", 171 | "hostname": "drivers.glass", 172 | "os": "Windows 8.1", 173 | "timestamp": "2022-09-16T07:52:47Z", 174 | "uri": "https://aaa.gov/2022/04/29/cialis-prayer-presentations-completed-avenue-vision?trucks=cut&indeed=members" 175 | }), 176 | ('ProcessingFailed', { 177 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 178 | "session_id": "a5aa20a72c9e37588f9bbeaa", 179 | "event": "cart", 180 | "referrer": "brandon.biz", 181 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 182 | "ip": "202.165.71.49", 183 | "hostname": "toxic.tokyo", 184 | "os": "openSUSE", 185 | # invalid datetime format 186 | "timestamp": "2022-09-16 07:35:46", 187 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 188 | }), 189 | ('ProcessingFailed', { 190 | # missing required data 191 | # "user_id": "045e63c7-b276-4117-9706-7c2e3b87d5f5", 192 | "session_id": "abfd47eb7dd7b8aeec0555a7", 193 | "event": "purchase", 194 | "referrer": "transfer.edu", 195 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 9.50", 196 | "ip": "170.128.148.234", 197 | "hostname": "propecia.tc", 198 | "os": "Lubuntu", 199 | "timestamp": "2022-09-16T07:46:04Z", 200 | 
"uri": "https://pee.cloud/2019/06/15/alan-publish-perl-snow-notification-gap-improvement-guaranteed-changed-determining?casino=admissions&cottage=hotel" 201 | }), 202 | ('ProcessingFailed', { 203 | "user_id": "e504cd9d-30da-497f-8f28-2b3f64220e16", 204 | "session_id": "fd4807ab825ee8bd950b1e8b", 205 | "event": "list", 206 | "referrer": "liquid.aquitaine", 207 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.02", 208 | # mismatched data type 209 | "ip": 212234672, 210 | "hostname": "consequently.com", 211 | "os": "Gentoo", 212 | "timestamp": "2022-09-16T07:13:29Z", 213 | "uri": "https://railway.sz/2014/10/30/use-phone-task-marketplace?pot=it&album=cook" 214 | }), 215 | ('ProcessingFailed', { 216 | # mismatched column name 217 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 218 | # mismatched column name 219 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 220 | "event": "visit", 221 | "referrer": "brandon.biz", 222 | # mismatched column name 223 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 224 | "ip": "202.165.71.49", 225 | "hostname": "toxic.tokyo", 226 | "os": "openSUSE", 227 | "timestamp": "2022-09-16T07:35:46Z", 228 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 229 | }) 230 | ] 231 | 232 | for correct_result, record in record_list: 233 | event = { 234 | "invocationId": "invocationIdExample", 235 | "deliveryStreamArn": "arn:aws:kinesis:EXAMPLE", 236 | "region": "us-east-1", 237 | "records": [ 238 | { 239 | "recordId": "49546986683135544286507457936321625675700192471156785154", 240 | "approximateArrivalTimestamp": 1495072949453, 241 | "data": base64.b64encode(json.dumps(record).encode('utf-8')) 242 | } 243 | ] 244 | } 245 | 246 | res = lambda_handler(event, {}) 247 | print(f"\n>> {correct_result} == {res['records'][0]['result']}?", res['records'][0]['result'] == correct_result) 248 | pprint.pprint(res) 249 | -------------------------------------------------------------------------------- /web-analytics-datafirehose-iceberg/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import ( 8 | datetime, 9 | timezone 10 | ) 11 | import json 12 | import time 13 | import typing 14 | 15 | from mimesis.locales import Locale 16 | from mimesis.schema import Field, Schema 17 | from mimesis.providers.base import BaseProvider 18 | import requests 19 | 20 | 21 | class CustomDatetime(BaseProvider): 22 | class Meta: 23 | """Class for metadata.""" 24 | name: typing.Final[str] = "custom_datetime" 25 | 26 | def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: 27 | super().__init__(*args, **kwargs) 28 | 29 | def timestamp(self) -> str: 30 | utc_now = datetime.now(timezone.utc) 31 | minute = self.random.randint(0, 59) 32 | second = self.random.randint(0, 59) 33 | random_datetime = utc_now.replace(minute=minute, second=second) 34 | return random_datetime.strftime("%Y-%m-%dT%H:%M:%SZ") 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | 40 | parser.add_argument('--api-url', help='log collector api url') 41 | parser.add_argument('--api-method', default='records', choices=['record', 'records'], 42 | help='log collector api method [record | records]') 43 | parser.add_argument('--stream-name', 
help='kinesis stream name') 44 | parser.add_argument('--max-count', default=15, type=int, help='max number of records to put') 45 | parser.add_argument('--dry-run', action='store_true') 46 | 47 | options = parser.parse_args() 48 | 49 | _field = Field(locale=Locale.EN) 50 | _field._generic.add_provider(CustomDatetime) 51 | 52 | schema_definition = lambda: { 53 | "user_id": _field("uuid"), 54 | "session_id": _field("token_hex", entropy=12), 55 | "event": _field("choice", items=['visit', 'view', 'list', 'like', 'cart', 'purchase']), 56 | "referrer": _field("internet.hostname"), 57 | "user_agent": _field("internet.user_agent"), 58 | "ip": _field("internet.ip_v4"), 59 | "hostname": _field("internet.hostname"), 60 | "os": _field("development.os"), 61 | "timestamp": _field("custom_datetime.timestamp"), 62 | "uri": _field("internet.uri", query_params_count=2) 63 | } 64 | schema = Schema(schema=schema_definition, iterations=options.max_count) 65 | 66 | log_collector_url = f'{options.api_url}/streams/{options.stream_name}/{options.api_method}' if not options.dry_run else None 67 | 68 | for record in schema: 69 | if options.dry_run: 70 | print(json.dumps(record), file=sys.stderr) 71 | continue 72 | 73 | if options.api_method == 'record': 74 | data = {'Data': record} 75 | payload = f'{json.dumps(data)}' 76 | else: 77 | #XXX: make sure data has newline 78 | data = {"records":[{'data': f'{json.dumps(record)}\n'}]} 79 | payload = json.dumps(data) 80 | 81 | res = requests.put(log_collector_url, data=payload, headers={'Content-Type': 'application/json'}) 82 | if res.status_code == 200: 83 | print(f'[{res.status_code} {res.reason}]', res.text, file=sys.stderr) 84 | else: 85 | print(f'[{res.status_code} {res.reason}]', file=sys.stderr) 86 | sys.exit(1) 87 | time.sleep(0.5) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /web-analytics-iceberg/.example.cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "acknowledged-issue-numbers": [ 3 | 32775 4 | ], 5 | "firehose_data_tranform_lambda": { 6 | "s3_bucket_name": "s3-bucket-name-for-lambda-layer-resources", 7 | "s3_object_key": "var/fastavro-lib.zip" 8 | }, 9 | "data_firehose_configuration": { 10 | "buffering_hints": { 11 | "interval_in_seconds": 60, 12 | "size_in_mbs": 128 13 | }, 14 | "transform_records_with_aws_lambda": { 15 | "buffer_size": 3, 16 | "buffer_interval": 300, 17 | "number_of_retries": 3 18 | }, 19 | "destination_iceberg_table_configuration": { 20 | "database_name": "web_log_iceberg_db", 21 | "table_name": "web_log_iceberg", 22 | "unique_keys": [ 23 | "user_id", "timestamp" 24 | ] 25 | }, 26 | "output_prefix": "web_log_iceberg_db/web_log_iceberg", 27 | "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /web-analytics-iceberg/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /web-analytics-iceberg/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web Log Analytics 
with Amazon Kinesis Data Streams Proxy using Amazon API Gateway 3 | 4 | This repository provides CDK scripts and sample code that show how to implement a simple [web analytics](https://en.wikipedia.org/wiki/Web_analytics) system.
5 | The diagram below shows what we are implementing. 6 | 7 | ![web-analytics-arch](web-analytics-iceberg-arch.svg) 8 | 9 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 10 | 11 | This project is set up like a standard Python project. The initialization 12 | process also creates a virtualenv within this project, stored under the `.venv` 13 | directory. To create the virtualenv it assumes that there is a `python3` 14 | (or `python` for Windows) executable in your path with access to the `venv` 15 | package. If for any reason the automatic creation of the virtualenv fails, 16 | you can create the virtualenv manually. 17 | 18 | To manually create a virtualenv on MacOS and Linux: 19 | 20 | ``` 21 | $ python3 -m venv .venv 22 | ``` 23 | 24 | After the init process completes and the virtualenv is created, you can use the following 25 | step to activate your virtualenv. 26 | 27 | ``` 28 | $ source .venv/bin/activate 29 | ``` 30 | 31 | If you are on a Windows platform, you can activate the virtualenv like this: 32 | 33 | ``` 34 | % .venv\Scripts\activate.bat 35 | ``` 36 | 37 | Once the virtualenv is activated, you can install the required dependencies. 38 | 39 | ``` 40 | (.venv) $ pip install -r requirements.txt 41 | ``` 42 | 43 | To add additional dependencies, for example other CDK libraries, just add 44 | them to your `requirements.txt` file and rerun the `pip install -r requirements.txt` 45 | command. 46 | 47 | ### Upload Lambda Layer code 48 | 49 | Before deployment, you should upload the zipped code files to S3 like this: 50 |
 51 | (.venv) $ aws s3api create-bucket --bucket your-s3-bucket-name-for-lambda-layer-code --region region-name
 52 | (.venv) $ ./build-aws-lambda-layer-package.sh your-s3-bucket-name-for-lambda-layer-code
 53 | 
54 | 55 | > :warning: To create a bucket outside of the `us-east-1` region, the `aws s3api create-bucket` command requires the appropriate **LocationConstraint** to be specified so that the bucket is created in the desired region. For more information, see these [examples](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3api/create-bucket.html#examples). 56 | 57 | > :warning: Make sure you have **Docker** installed. 58 | 59 | For example, 60 |
 61 | (.venv) $ aws s3api create-bucket --bucket lambda-layer-resources --region us-east-1
 62 | (.venv) $ ./build-aws-lambda-layer-package.sh lambda-layer-resources
 63 | 
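If you want the Lambda layer bucket in a region other than `us-east-1`, the same `create-bucket` call needs the **LocationConstraint** mentioned above. A minimal sketch, assuming `us-west-2` and the same illustrative bucket name:

```
(.venv) $ aws s3api create-bucket --bucket lambda-layer-resources \
              --region us-west-2 \
              --create-bucket-configuration LocationConstraint=us-west-2
```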
64 | 65 | For more information about how to create a package for an AWS Lambda layer, see [here](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/). 66 | 67 | ### Deploy 68 | 69 | Before synthesizing the CloudFormation template for this code, you should update the `cdk.context.json` file.
70 | In particular, you need to fill in the S3 location of the Lambda layer code uploaded earlier. 71 | 72 | For example, 73 |
 74 | {
 75 |   "firehose_data_tranform_lambda": {
 76 |     "s3_bucket_name": "lambda-layer-resources",
 77 |     "s3_object_key": "var/fastavro-lib.zip"
 78 |   },
 79 |   "data_firehose_configuration": {
 80 |     "buffering_hints": {
 81 |       "interval_in_seconds": 60,
 82 |       "size_in_mbs": 128
 83 |     },
 84 |     "transform_records_with_aws_lambda": {
 85 |       "buffer_size": 3,
 86 |       "buffer_interval": 300,
 87 |       "number_of_retries": 3
 88 |     },
 89 |     "destination_iceberg_table_configuration": {
 90 |       "database_name": "web_log_iceberg_db",
 91 |       "table_name": "web_log_iceberg"
 92 |     },
 93 |     "output_prefix": "web_log_iceberg_db/web_log_iceberg",
 94 |     "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}"
 95 |   }
 96 | }
 97 | 
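If you do not have a `cdk.context.json` file yet, one simple starting point (assuming you are in the `web-analytics-iceberg` project directory) is to copy the bundled `.example.cdk.context.json` and then edit the values shown above:

```
(.venv) $ cp .example.cdk.context.json cdk.context.json
```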
98 | :information_source: The `database_name` and `table_name` values in `data_firehose_configuration.destination_iceberg_table_configuration` are used in the [**Set up Delivery Stream**](#set-up-delivery-stream) step. 99 | 100 | :information_source: When updating or deleting records in an Iceberg table, specify the table's primary key column name(s) as `unique_keys` in the `data_firehose_configuration.destination_iceberg_table_configuration` settings. 101 | For example, 102 |
103 | "destination_iceberg_table_configuration": {
104 |   "database_name": "web_log_iceberg_db",
105 |   "table_name": "web_log_iceberg",
106 |   "unique_keys": [
107 |     "user_id", "timestamp"
108 |   ]
109 | }
110 | 
111 | 112 | 113 | Now you are ready to synthesize the CloudFormation template for this code.
114 | 115 |
116 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
117 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
118 | (.venv) $ cdk synth --all
119 | 
120 | 121 | Now let's try to deploy. 122 | 123 | ## List all CDK Stacks 124 | 125 | ``` 126 | (.venv) $ cdk list 127 | WebAnalyticsKdsProxyApiGw 128 | WebAnalyticsKinesisStream 129 | WebAnalyticsDataFirehoseToIcebergS3Path 130 | WebAnalyticsFirehoseDataTransformLambdaStack 131 | WebAnalyticsFirehoseToIcebergRoleStack 132 | WebAnalyticsGrantLFPermissionsOnFirehoseRole 133 | WebAnalyticsFirehoseToIcebergStack 134 | ``` 135 | 136 | Use `cdk deploy` command to create the stack shown above. 137 | 138 | ## Create API endpoint for web data collection 139 | 140 |
141 | (.venv) $ cdk deploy --require-approval never \
142 |               WebAnalyticsKdsProxyApiGw \
143 |               WebAnalyticsKinesisStream
144 | 
145 | 146 | ## Set up Delivery Stream 147 | 148 | 1. Create a S3 bucket for Apache Iceberg table 149 |
150 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsDataFirehoseToIcebergS3Path
151 |    
152 | 2. Create a table with partitioned data in Amazon Athena 153 | 154 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console.
155 | * (step 1) Create a database 156 | 157 | In order to create a new database called `web_log_iceberg_db`, enter the following statement in the Athena query editor and click the **Run** button to execute the query. 158 | 159 |
160 |       CREATE DATABASE IF NOT EXISTS web_log_iceberg_db;
161 |       
162 | 163 | * (step 2) Create a table 164 | 165 | Copy the following query into the Athena query editor. 166 | 167 | Update `LOCATION` to your S3 bucket name and execute the query to create a new table. 168 |
169 |       CREATE TABLE web_log_iceberg_db.web_log_iceberg (
170 |         `user_id` string,
171 |         `session_id` string,
172 |         `event` string,
173 |         `referrer` string,
174 |         `user_agent` string,
175 |         `ip` string,
176 |         `hostname` string,
177 |         `os` string,
178 |         `timestamp` timestamp,
179 |         `uri` string
180 |       )
181 |       PARTITIONED BY (event)
182 |       LOCATION 's3://web-analytics-{region}-{account_id}/web_log_iceberg_db/web_log_iceberg'
183 |       TBLPROPERTIES (
184 |         'table_type'='iceberg',
185 |         'format'='parquet',
186 |         'write_compression'='snappy',
187 |         'optimize_rewrite_delete_file_threshold'='10'
188 |       );
189 |       
190 | If the query is successful, a table named `web_log_iceberg` is created and displayed on the left panel under the **Tables** section. 191 | 192 | If you get an error, check if (a) you have updated the `LOCATION` to the correct S3 bucket name, (b) you have `web_log_iceberg_db` selected under the Database dropdown, and (c) you have `AwsDataCatalog` selected as the **Data source**. 193 | 3. Create a lambda function to process the streaming data. 194 |
195 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseDataTransformLambdaStack
196 |    
197 | 4. To allow Data Firehose to ingest data into the Apache Iceberg table, create an IAM role and grant permissions to the role. 198 |
199 |    (.venv) $ cdk deploy --require-approval never \
200 |                  WebAnalyticsFirehoseToIcebergRoleStack \
201 |                  WebAnalyticsGrantLFPermissionsOnFirehoseRole
202 |    
203 | 204 | :information_source: If you fail to create the table, give Athena users access permissions on `web_log_iceberg_db` through [AWS Lake Formation](https://console.aws.amazon.com/lakeformation/home), or grant Amazon Data Firehose access to `web_log_iceberg_db` by running the following commands: 205 |
206 |    (.venv) $ aws lakeformation grant-permissions \
207 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
208 |                  --permissions CREATE_TABLE DESCRIBE ALTER DROP \
209 |                  --resource '{ "Database": { "Name": "web_log_iceberg_db" } }'
210 |    (.venv) $ aws lakeformation grant-permissions \
211 |                  --principal DataLakePrincipalIdentifier=arn:aws:iam::{account-id}:role/role-id \
212 |                  --permissions SELECT DESCRIBE ALTER INSERT DELETE DROP \
213 |                  --resource '{ "Table": {"DatabaseName": "web_log_iceberg_db", "TableWildcard": {}} }'
214 |    
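   In the commands above, `arn:aws:iam::{account-id}:role/role-id` is the ARN of the Data Firehose role. One way to look it up, assuming the role was created by the `WebAnalyticsFirehoseToIcebergRoleStack` stack (which exports it as the `FirehoseRoleArn` output), is:

   ```
   (.venv) $ aws cloudformation describe-stacks \
                 --stack-name WebAnalyticsFirehoseToIcebergRoleStack \
                 --query "Stacks[0].Outputs[?OutputKey=='FirehoseRoleArn'].OutputValue" \
                 --output text
   ```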
215 | 5. Deploy Amazon Data Firehose. 216 |
217 |    (.venv) $ cdk deploy --require-approval never WebAnalyticsFirehoseToIcebergStack
218 |    
219 | 220 | ## Run Test 221 | 222 | 1. Run `GET /streams` method to invoke `ListStreams` in Kinesis 223 |
224 |    $ curl -X GET https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1/streams
225 |    
226 | 227 | The response is: 228 |
229 |    {
230 |      "HasMoreStreams": false,
231 |      "StreamNames": [
232 |        "PUT-Firehose-aEhWz"
233 |      ],
234 |      "StreamSummaries": [
235 |        {
236 |          "StreamARN": "arn:aws:kinesis:us-east-1:123456789012:stream/PUT-Firehose-aEhWz",
237 |          "StreamCreationTimestamp": 1661612556,
238 |          "StreamModeDetails": {
239 |            "StreamMode": "ON_DEMAND"
240 |          },
241 |          "StreamName": "PUT-Firehose-aEhWz",
242 |          "StreamStatus": "ACTIVE"
243 |        }
244 |      ]
245 |    }
246 |    
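   The deployed proxy API also exposes `GET /streams/{stream-name}` (see `cdk_stacks/apigw.py`), so you can describe a single stream in the same way; for example, using the stream name returned above:

   ```
   $ curl -X GET https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1/streams/PUT-Firehose-aEhWz
   ```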
247 | 248 | 2. Generate test data. 249 |
250 |    (.venv) $ pip install -r requirements-dev.txt
251 |    (.venv) $ python src/utils/gen_fake_data.py --max-count 5 --stream-name PUT-Firehose-aEhWz --api-url 'https://your-api-gateway-id.execute-api.us-east-1.amazonaws.com/v1' --api-method records
252 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260289903462649185194773668901646666226496176178","ShardId":"shardId-000000000003"}]}
253 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260289903462649185194774877827466280924390359090","ShardId":"shardId-000000000003"}]}
254 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260223001227053593325351479598467950537766600706","ShardId":"shardId-000000000000"}]}
255 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260245301972252123948494224242560213528447287314","ShardId":"shardId-000000000001"}]}
256 |    [200 OK] {"EncryptionType":"KMS","FailedRecordCount":0,"Records":[{"SequenceNumber":"49633315260223001227053593325353897450107179933554966530","ShardId":"shardId-000000000000"}]}
257 |    
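   If you only want to inspect the generated records without calling the API, `gen_fake_data.py` also accepts a `--dry-run` flag that prints each record instead of sending it, for example:

   ```
   (.venv) $ python src/utils/gen_fake_data.py --dry-run --max-count 2
   ```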
258 | 259 | 3. Check streaming data in S3 260 | 261 | After `5~10` minutes, you can see that the streaming data have been delivered from **Kinesis Data Streams** to **S3**. 262 | 263 | ![iceberg-table](./assets/wa-iceberg-table.png) 264 | ![iceberg-table-data-level-01](./assets/wa-iceberg-data-level-01.png) 265 | ![iceberg-table-data-level-02](./assets/wa-iceberg-data-level-02.png) 266 | ![iceberg-table-data-level-03](./assets/wa-iceberg-data-level-03.png) 267 | 268 | 4. Run test query using Amazon Athena 269 | 270 | Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console. 271 | 272 | * (Step 1) Specify the workgroup to use 273 | 274 | To run queries, switch to the appropriate workgroup like this: 275 | ![amazon-athena-switching-to-workgroup](./assets/amazon-athena-switching-to-workgroup.png) 276 | 277 | * (Step 2) Run test query 278 | 279 | Enter the following SQL statement and execute the query. 280 |
281 |      SELECT COUNT(*)
282 |      FROM web_log_iceberg_db.web_log_iceberg;
283 |      
284 | 285 | ## Clean Up 286 | 287 | Delete the CloudFormation stack by running the below command. 288 |
289 | (.venv) $ cdk destroy --force --all
290 | 
291 | 292 | 293 | ## Useful commands 294 | 295 | * `cdk ls` list all stacks in the app 296 | * `cdk synth` emits the synthesized CloudFormation template 297 | * `cdk deploy` deploy this stack to your default AWS account/region 298 | * `cdk diff` compare deployed stack with current state 299 | * `cdk docs` open CDK documentation 300 | 301 | Enjoy! 302 | 303 | ## References 304 | 305 | * [Web Analytics](https://en.wikipedia.org/wiki/Web_analytics) 306 | * [Tutorial: Create a REST API as an Amazon Kinesis proxy in API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html) 307 | * [Streaming Data Solution for Amazon Kinesis](https://aws.amazon.com/ko/solutions/implementations/aws-streaming-data-solution-for-amazon-kinesis/) 308 |
309 | 310 |
311 | * [(AWS Developer Guide) Deliver data to Apache Iceberg Tables with Amazon Data Firehose](https://docs.aws.amazon.com/firehose/latest/dev/apache-iceberg-destination.html) 312 | * [Building fine-grained authorization using Amazon Cognito, API Gateway, and IAM](https://aws.amazon.com/ko/blogs/security/building-fine-grained-authorization-using-amazon-cognito-api-gateway-and-iam/) 313 | * [AWS Lake Formation - Create a data lake administrator](https://docs.aws.amazon.com/lake-formation/latest/dg/getting-started-setup.html#create-data-lake-admin) 314 | * [AWS Lake Formation Permissions Reference](https://docs.aws.amazon.com/lake-formation/latest/dg/lf-permissions-reference.html) 315 | * [Amazon Athena Using Iceberg tables](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html) 316 | * [Amazon Athena Workshop](https://athena-in-action.workshop.aws/) 317 | * [Curl Cookbook](https://catonmat.net/cookbooks/curl) 318 | * [fastavro](https://fastavro.readthedocs.io/) - Fast read/write of `AVRO` files 319 | * [Apache Avro Specification](https://avro.apache.org/docs/current/spec.html) 320 | * [How to create a Lambda layer using a simulated Lambda environment with Docker](https://aws.amazon.com/premiumsupport/knowledge-center/lambda-layer-simulated-docker/) 321 | ``` 322 | $ cat <<EOF > requirements-Lambda-Layer.txt 323 | > fastavro==1.6.1 324 | > EOF 325 | $ docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install -r requirements-Lambda-Layer.txt -t python/lib/python3.11/site-packages/; exit" 326 | $ zip -r fastavro-lib.zip python > /dev/null 327 | $ aws s3 mb s3://my-bucket-for-lambda-layer-packages 328 | $ aws s3 cp fastavro-lib.zip s3://my-bucket-for-lambda-layer-packages/ 329 | ``` 330 | 331 | ## Security 332 | 333 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 334 | 335 | ## License 336 | 337 | This library is licensed under the MIT-0 License. See the LICENSE file.
338 | 339 | -------------------------------------------------------------------------------- /web-analytics-iceberg/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from cdk_stacks import ( 7 | KdsProxyApiGwStack, 8 | KdsStack, 9 | FirehoseToIcebergStack, 10 | FirehoseRoleStack, 11 | FirehoseDataProcLambdaStack, 12 | DataLakePermissionsStack, 13 | S3BucketStack, 14 | ) 15 | 16 | AWS_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 17 | region=os.getenv('CDK_DEFAULT_REGION')) 18 | 19 | app = cdk.App() 20 | 21 | kds_proxy_apigw = KdsProxyApiGwStack(app, 'WebAnalyticsKdsProxyApiGw') 22 | kds_stack = KdsStack(app, 'WebAnalyticsKinesisStream') 23 | 24 | s3_dest_bucket = S3BucketStack(app, 'WebAnalyticsDataFirehoseToIcebergS3Path', 25 | env=AWS_ENV) 26 | s3_dest_bucket.add_dependency(kds_stack) 27 | 28 | firehose_data_transform_lambda = FirehoseDataProcLambdaStack(app, 29 | 'WebAnalyticsFirehoseDataTransformLambdaStack', 30 | env=AWS_ENV 31 | ) 32 | firehose_data_transform_lambda.add_dependency(s3_dest_bucket) 33 | 34 | firehose_role = FirehoseRoleStack(app, 'WebAnalyticsFirehoseToIcebergRoleStack', 35 | firehose_data_transform_lambda.data_proc_lambda_fn, 36 | kds_stack.kinesis_stream, 37 | s3_dest_bucket.s3_bucket, 38 | env=AWS_ENV 39 | ) 40 | firehose_role.add_dependency(firehose_data_transform_lambda) 41 | 42 | grant_lake_formation_permissions = DataLakePermissionsStack(app, 'WebAnalyticsGrantLFPermissionsOnFirehoseRole', 43 | firehose_role.firehose_role, 44 | env=AWS_ENV 45 | ) 46 | grant_lake_formation_permissions.add_dependency(firehose_role) 47 | 48 | firehose_stack = FirehoseToIcebergStack(app, 'WebAnalyticsFirehoseToIcebergStack', 49 | firehose_data_transform_lambda.data_proc_lambda_fn, 50 | kds_stack.kinesis_stream, 51 | s3_dest_bucket.s3_bucket, 52 | firehose_role.firehose_role, 53 | env=AWS_ENV 54 | ) 55 | firehose_stack.add_dependency(grant_lake_formation_permissions) 56 | 57 | app.synth() 58 | -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/amazon-athena-switching-to-workgroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/amazon-athena-switching-to-workgroup.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-data-level-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-data-level-01.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-data-level-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-data-level-02.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-data-level-03.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-data-level-03.png -------------------------------------------------------------------------------- /web-analytics-iceberg/assets/wa-iceberg-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-iceberg/assets/wa-iceberg-table.png -------------------------------------------------------------------------------- /web-analytics-iceberg/build-aws-lambda-layer-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash - 2 | 3 | VERSION=1.10.0 4 | PY_VERSION=3.11 5 | LAMBDA_LAYER_NAME=fastavro-lib-${VERSION}-py-${PY_VERSION} 6 | S3_PATH=$1 7 | 8 | docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.11" /bin/sh -c "pip install fastavro==${VERSION} -t python/lib/python3.11/site-packages/; exit" 9 | 10 | zip -q -r ${LAMBDA_LAYER_NAME}.zip python >/dev/null 11 | aws s3 cp --quiet ${LAMBDA_LAYER_NAME}.zip s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip 12 | echo "[Lambda_Layer_Code_S3_Path] s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip" 13 | 14 | -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .apigw import KdsProxyApiGwStack 2 | from .firehose_to_iceberg import FirehoseToIcebergStack 3 | from .firehose_role import FirehoseRoleStack 4 | from .firehose_data_proc_lambda import FirehoseDataProcLambdaStack 5 | from .kds import KdsStack 6 | from .lake_formation import DataLakePermissionsStack 7 | from .s3 import S3BucketStack -------------------------------------------------------------------------------- 
/web-analytics-iceberg/cdk_stacks/apigw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import json 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_apigateway, 12 | aws_iam, 13 | ) 14 | from constructs import Construct 15 | 16 | 17 | class KdsProxyApiGwStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | apigw_kds_access_role_policy_doc = aws_iam.PolicyDocument() 23 | apigw_kds_access_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": ["*"], 26 | "actions": [ 27 | "kinesis:DescribeStream", 28 | "kinesis:PutRecord", 29 | "kinesis:PutRecords"] 30 | })) 31 | 32 | apigw_kds_role = aws_iam.Role(self, "APIGatewayRoleToAccessKinesisDataStreams", 33 | role_name='APIGatewayRoleToAccessKinesisDataStreams', 34 | assumed_by=aws_iam.ServicePrincipal('apigateway.amazonaws.com'), 35 | inline_policies={ 36 | 'KinesisWriteAccess': apigw_kds_access_role_policy_doc 37 | }, 38 | managed_policies=[ 39 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisReadOnlyAccess') 40 | ] 41 | ) 42 | 43 | #XXX: Start to create an API as a Kinesis proxy 44 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-api-as-kinesis-proxy 45 | kds_proxy_api = aws_apigateway.RestApi(self, "KdsProxyAPI", 46 | rest_api_name="log-collector", 47 | description="An Amazon API Gateway REST API that integrated with an Amazon Kinesis Data Streams.", 48 | endpoint_types=[aws_apigateway.EndpointType.REGIONAL], 49 | default_cors_preflight_options={ 50 | "allow_origins": aws_apigateway.Cors.ALL_ORIGINS 51 | }, 52 | deploy=True, 53 | deploy_options=aws_apigateway.StageOptions(stage_name="v1"), 54 | endpoint_export_name="KdsProxyAPIEndpoint" 55 | ) 56 | 57 | apigw_error_responses = [ 58 | aws_apigateway.IntegrationResponse(status_code="400", selection_pattern="4\\d{2}"), 59 | aws_apigateway.IntegrationResponse(status_code="500", selection_pattern="5\\d{2}") 60 | ] 61 | 62 | #XXX: GET /streams 63 | # List Kinesis streams by using the API Gateway console 64 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-list-kinesis-streams 65 | 66 | streams_resource = kds_proxy_api.root.add_resource("streams") 67 | 68 | list_streams_options = aws_apigateway.IntegrationOptions( 69 | credentials_role=apigw_kds_role, 70 | integration_responses=[ 71 | aws_apigateway.IntegrationResponse( 72 | status_code="200" 73 | ), 74 | *apigw_error_responses 75 | ], 76 | request_templates={ 77 | 'application/json': '{}' 78 | }, 79 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 80 | ) 81 | 82 | list_streams_integration = aws_apigateway.AwsIntegration( 83 | service='kinesis', 84 | action='ListStreams', 85 | integration_http_method='POST', 86 | options=list_streams_options 87 | ) 88 | 89 | streams_resource.add_method("GET", list_streams_integration, 90 | # Default `authorization_type`: - open access unless `authorizer` is specified 91 | authorization_type=aws_apigateway.AuthorizationType.NONE, 92 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 93 | response_models={ 94 | 'application/json': 
aws_apigateway.Model.EMPTY_MODEL 95 | } 96 | ), 97 | aws_apigateway.MethodResponse(status_code='400'), 98 | aws_apigateway.MethodResponse(status_code='500') 99 | ]) 100 | 101 | #XXX: GET /streams/{stream-name} 102 | # Describe a stream in Kinesis 103 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-describe-delete-stream 104 | one_stream_resource = streams_resource.add_resource("{stream-name}") 105 | 106 | describe_stream_options = aws_apigateway.IntegrationOptions( 107 | credentials_role=apigw_kds_role, 108 | integration_responses=[ 109 | aws_apigateway.IntegrationResponse( 110 | status_code="200" 111 | ), 112 | *apigw_error_responses 113 | ], 114 | request_templates={ 115 | 'application/json': json.dumps({ 116 | "StreamName": "$input.params('stream-name')" 117 | }, indent=2) 118 | }, 119 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 120 | ) 121 | 122 | describe_stream_integration = aws_apigateway.AwsIntegration( 123 | service='kinesis', 124 | action='DescribeStream', 125 | integration_http_method='POST', 126 | options=describe_stream_options 127 | ) 128 | 129 | one_stream_resource.add_method("GET", describe_stream_integration, 130 | # Default `authorization_type`: - open access unless `authorizer` is specified 131 | authorization_type=aws_apigateway.AuthorizationType.NONE, 132 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 133 | response_models={ 134 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 135 | } 136 | ), 137 | aws_apigateway.MethodResponse(status_code='400'), 138 | aws_apigateway.MethodResponse(status_code='500') 139 | ]) 140 | 141 | #XXX: PUT /streams/{stream-name}/record 142 | # Put a record into a stream in Kinesis 143 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 144 | record_resource = one_stream_resource.add_resource("record") 145 | 146 | put_record_request_mapping_templates = ''' 147 | { 148 | "StreamName": "$input.params('stream-name')", 149 | "Data": "$util.base64Encode($input.json('$.Data'))", 150 | "PartitionKey": "$input.path('$.PartitionKey')" 151 | } 152 | ''' 153 | 154 | put_record_options = aws_apigateway.IntegrationOptions( 155 | credentials_role=apigw_kds_role, 156 | integration_responses=[ 157 | aws_apigateway.IntegrationResponse( 158 | status_code="200" 159 | ), 160 | *apigw_error_responses 161 | ], 162 | request_templates={ 163 | 'application/json': put_record_request_mapping_templates 164 | }, 165 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 166 | ) 167 | 168 | put_record_integration = aws_apigateway.AwsIntegration( 169 | service='kinesis', 170 | action='PutRecord', 171 | integration_http_method='POST', 172 | options=put_record_options 173 | ) 174 | 175 | record_resource.add_method("PUT", put_record_integration, 176 | # Default `authorization_type`: - open access unless `authorizer` is specified 177 | authorization_type=aws_apigateway.AuthorizationType.NONE, 178 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 179 | response_models={ 180 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 181 | } 182 | ), 183 | aws_apigateway.MethodResponse(status_code='400'), 184 | aws_apigateway.MethodResponse(status_code='500') 185 | ]) 186 | 187 | 188 | #XXX: PUT /streams/{stream-name}/records 189 | # Put records into a stream in Kinesis 190 | # 
https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 191 | records_resource = one_stream_resource.add_resource("records") 192 | 193 | put_records_request_mapping_templates = ''' 194 | { 195 | "StreamName": "$input.params('stream-name')", 196 | "Records": [ 197 | #foreach($elem in $input.path('$.records')) 198 | { 199 | "Data": "$util.base64Encode($elem.data)", 200 | "PartitionKey": "$elem.partition-key" 201 | }#if($foreach.hasNext),#end 202 | #end 203 | ] 204 | } 205 | ''' 206 | 207 | put_records_options = aws_apigateway.IntegrationOptions( 208 | credentials_role=apigw_kds_role, 209 | integration_responses=[ 210 | aws_apigateway.IntegrationResponse( 211 | status_code="200" 212 | ), 213 | *apigw_error_responses 214 | ], 215 | request_templates={ 216 | 'application/json': put_records_request_mapping_templates 217 | }, 218 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 219 | ) 220 | 221 | put_records_integration = aws_apigateway.AwsIntegration( 222 | service='kinesis', 223 | action='PutRecords', 224 | integration_http_method='POST', 225 | options=put_records_options 226 | ) 227 | 228 | records_resource.add_method("PUT", put_records_integration, 229 | # Default `authorization_type`: - open access unless `authorizer` is specified 230 | authorization_type=aws_apigateway.AuthorizationType.NONE, 231 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 232 | response_models={ 233 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 234 | } 235 | ), 236 | aws_apigateway.MethodResponse(status_code='400'), 237 | aws_apigateway.MethodResponse(status_code='500') 238 | ]) 239 | 240 | cdk.CfnOutput(self, 'KdsRestApiName', 241 | value=kds_proxy_api.rest_api_name, 242 | export_name=f'{self.stack_name}-KdsProxyRestApiName') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/firehose_data_proc_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_lambda, 12 | aws_logs, 13 | aws_s3 as s3 14 | ) 15 | from constructs import Construct 16 | 17 | 18 | class FirehoseDataProcLambdaStack(Stack): 19 | 20 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 21 | super().__init__(scope, construct_id, **kwargs) 22 | 23 | firehose_data_transform_lambda_config = self.node.try_get_context('firehose_data_tranform_lambda') 24 | LAMBDA_LAYER_CODE_S3_BUCKET = firehose_data_transform_lambda_config['s3_bucket_name'] 25 | LAMBDA_LAYER_CODE_S3_OBJ_KEY = firehose_data_transform_lambda_config['s3_object_key'] 26 | 27 | s3_lambda_layer_lib_bucket = s3.Bucket.from_bucket_name(self, "LambdaLayerS3Bucket", LAMBDA_LAYER_CODE_S3_BUCKET) 28 | lambda_lib_layer = aws_lambda.LayerVersion(self, "SchemaValidatorLib", 29 | layer_version_name="fastavro-lib", 30 | compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11], 31 | code=aws_lambda.Code.from_bucket(s3_lambda_layer_lib_bucket, LAMBDA_LAYER_CODE_S3_OBJ_KEY) 32 | ) 33 | 34 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 35 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 36 | dest_iceberg_table_unique_keys = 
dest_iceberg_table_config.get("unique_keys", None) 37 | dest_iceberg_table_unique_keys = ",".join(dest_iceberg_table_unique_keys) if dest_iceberg_table_unique_keys else "" 38 | 39 | LAMBDA_FN_NAME = "WebAnalyticsFirehoseToIcebergTransformer" 40 | self.data_proc_lambda_fn = aws_lambda.Function(self, "FirehoseToIcebergTransformer", 41 | runtime=aws_lambda.Runtime.PYTHON_3_11, 42 | function_name=LAMBDA_FN_NAME, 43 | handler="firehose_to_iceberg_transformer.lambda_handler", 44 | description="Transform records to Apache Iceberg table", 45 | code=aws_lambda.Code.from_asset(os.path.join(os.path.dirname(__file__), '../src/main/python/IcebergTransformer')), 46 | environment={ 47 | "IcebergDatabaseName": dest_iceberg_table_config["database_name"], 48 | "IcebergTableName": dest_iceberg_table_config["table_name"], 49 | "IcebergTableUniqueKeys": dest_iceberg_table_unique_keys 50 | }, 51 | timeout=cdk.Duration.minutes(5), 52 | #XXX: set memory size appropriately 53 | memory_size=256, 54 | layers=[lambda_lib_layer] 55 | ) 56 | 57 | log_group = aws_logs.LogGroup(self, "FirehoseToIcebergTransformerLogGroup", 58 | #XXX: Circular dependency between resources occurs 59 | # if aws_lambda.Function.function_name is used 60 | # instead of literal name of lambda function such as "FirehoseToIcebergTransformer" 61 | log_group_name=f"/aws/lambda/{LAMBDA_FN_NAME}", 62 | retention=aws_logs.RetentionDays.THREE_DAYS, 63 | removal_policy=cdk.RemovalPolicy.DESTROY 64 | ) 65 | log_group.grant_write(self.data_proc_lambda_fn) 66 | 67 | 68 | cdk.CfnOutput(self, 'FirehoseDataProcFuncName', 69 | value=self.data_proc_lambda_fn.function_name, 70 | export_name=f'{self.stack_name}-FirehoseDataProcFuncName') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/firehose_role.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_iam 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class FirehoseRoleStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, 17 | data_transform_lambda_fn, source_kinesis_stream, s3_bucket, 18 | **kwargs) -> None: 19 | 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | firehose_role_policy_doc = aws_iam.PolicyDocument() 23 | 24 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 25 | effect=aws_iam.Effect.ALLOW, 26 | resources=[source_kinesis_stream.stream_arn], 27 | actions=[ 28 | "kinesis:DescribeStream", 29 | "kinesis:GetShardIterator", 30 | "kinesis:GetRecords", 31 | "kinesis:ListShards" 32 | ] 33 | )) 34 | 35 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 36 | "effect": aws_iam.Effect.ALLOW, 37 | "resources": [s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)], 38 | "actions": [ 39 | "s3:AbortMultipartUpload", 40 | "s3:GetBucketLocation", 41 | "s3:GetObject", 42 | "s3:ListBucket", 43 | "s3:ListBucketMultipartUploads", 44 | "s3:PutObject", 45 | "s3:DeleteObject" 46 | ] 47 | })) 48 | 49 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 50 | "effect": aws_iam.Effect.ALLOW, 51 | "resources": [ 52 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog", 53 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:database/*", 54 | f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:table/*/*" 55 | ], 56 | 
"actions": [ 57 | "glue:GetTable", 58 | "glue:GetDatabase", 59 | "glue:UpdateTable" 60 | ] 61 | })) 62 | 63 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 64 | effect=aws_iam.Effect.ALLOW, 65 | resources=["*"], 66 | actions=[ 67 | "ec2:DescribeVpcs", 68 | "ec2:DescribeVpcAttribute", 69 | "ec2:DescribeSubnets", 70 | "ec2:DescribeSecurityGroups", 71 | "ec2:DescribeNetworkInterfaces", 72 | "ec2:CreateNetworkInterface", 73 | "ec2:CreateNetworkInterfacePermission", 74 | "ec2:DeleteNetworkInterface" 75 | ] 76 | )) 77 | 78 | #XXX: https://docs.aws.amazon.com/ko_kr/cdk/latest/guide/tokens.html 79 | # String-encoded tokens: 80 | # Avoid manipulating the string in other ways. For example, 81 | # taking a substring of a string is likely to break the string token. 82 | firehose_log_group_name = f"/aws/kinesisfirehose/{source_kinesis_stream.stream_name}" 83 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 84 | effect=aws_iam.Effect.ALLOW, 85 | #XXX: The ARN will be formatted as follows: 86 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 87 | resources=[self.format_arn(service="logs", resource="log-group", 88 | resource_name="{}:log-stream:*".format(firehose_log_group_name), 89 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 90 | actions=["logs:PutLogEvents"] 91 | )) 92 | 93 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 94 | "effect": aws_iam.Effect.ALLOW, 95 | "resources": [f"{data_transform_lambda_fn.function_arn}:*"], 96 | "actions": [ 97 | "lambda:InvokeFunction", 98 | "lambda:GetFunctionConfiguration" 99 | ] 100 | })) 101 | 102 | self.firehose_role = aws_iam.Role(self, "KinesisFirehoseServiceRole", 103 | role_name=f"KinesisFirehoseServiceRole-{source_kinesis_stream.stream_name}-{self.region}", 104 | assumed_by=aws_iam.ServicePrincipal("firehose.amazonaws.com"), 105 | #XXX: use inline_policies to work around https://github.com/aws/aws-cdk/issues/5221 106 | inline_policies={ 107 | "firehose_role_policy": firehose_role_policy_doc 108 | } 109 | ) 110 | 111 | 112 | cdk.CfnOutput(self, 'FirehoseRole', 113 | value=self.firehose_role.role_name, 114 | export_name=f'{self.stack_name}-Role') 115 | cdk.CfnOutput(self, 'FirehoseRoleArn', 116 | value=self.firehose_role.role_arn, 117 | export_name=f'{self.stack_name}-RoleArn') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/firehose_to_iceberg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_s3 as s3, 10 | aws_kinesisfirehose 11 | ) 12 | from constructs import Construct 13 | 14 | from aws_cdk.aws_kinesisfirehose import CfnDeliveryStream as cfn_delivery_stream 15 | 16 | 17 | class FirehoseToIcebergStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, 20 | data_transform_lambda_fn, source_kinesis_stream, s3_bucket, 21 | firehose_role, **kwargs) -> None: 22 | 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 26 | 27 | firehose_log_group_name = f"/aws/kinesisfirehose/{source_kinesis_stream.stream_name}" 28 | 29 | firehose_buffering_hints = data_firehose_configuration["buffering_hints"] 30 | firehose_buffer_size = firehose_buffering_hints["size_in_mbs"] 31 | 
firehose_buffer_interval = firehose_buffering_hints["interval_in_seconds"] 32 | 33 | transform_records_with_aws_lambda = data_firehose_configuration["transform_records_with_aws_lambda"] 34 | firehose_lambda_buffer_size = transform_records_with_aws_lambda["buffer_size"] 35 | firehose_lambda_buffer_interval = transform_records_with_aws_lambda["buffer_interval"] 36 | firehose_lambda_number_of_retries = transform_records_with_aws_lambda["number_of_retries"] 37 | 38 | s3_output_prefix = data_firehose_configuration["output_prefix"] 39 | s3_error_output_prefix = data_firehose_configuration["error_output_prefix"] 40 | 41 | lambda_proc = cfn_delivery_stream.ProcessorProperty( 42 | type="Lambda", 43 | parameters=[ 44 | cfn_delivery_stream.ProcessorParameterProperty( 45 | parameter_name="LambdaArn", 46 | parameter_value='{}:{}'.format( 47 | data_transform_lambda_fn.function_arn, 48 | data_transform_lambda_fn.latest_version.version 49 | ) 50 | ), 51 | cfn_delivery_stream.ProcessorParameterProperty( 52 | parameter_name="NumberOfRetries", 53 | parameter_value=str(firehose_lambda_number_of_retries) 54 | ), 55 | cfn_delivery_stream.ProcessorParameterProperty( 56 | parameter_name="RoleArn", 57 | parameter_value=firehose_role.role_arn 58 | ), 59 | cfn_delivery_stream.ProcessorParameterProperty( 60 | parameter_name="BufferSizeInMBs", 61 | parameter_value=str(firehose_lambda_buffer_size) 62 | ), 63 | cfn_delivery_stream.ProcessorParameterProperty( 64 | parameter_name="BufferIntervalInSeconds", 65 | parameter_value=str(firehose_lambda_buffer_interval) 66 | ) 67 | ] 68 | ) 69 | 70 | firehose_processing_config = cfn_delivery_stream.ProcessingConfigurationProperty( 71 | enabled=True, 72 | processors=[ 73 | lambda_proc 74 | ] 75 | ) 76 | 77 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 78 | dest_iceberg_table_unique_keys = dest_iceberg_table_config.get("unique_keys", None) 79 | dest_iceberg_table_unique_keys = dest_iceberg_table_unique_keys if dest_iceberg_table_unique_keys else None 80 | 81 | iceberg_dest_config = cfn_delivery_stream.IcebergDestinationConfigurationProperty( 82 | catalog_configuration=cfn_delivery_stream.CatalogConfigurationProperty( 83 | catalog_arn=f"arn:aws:glue:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:catalog" 84 | ), 85 | role_arn=firehose_role.role_arn, 86 | s3_configuration=cfn_delivery_stream.S3DestinationConfigurationProperty( 87 | bucket_arn=s3_bucket.bucket_arn, 88 | role_arn=firehose_role.role_arn, 89 | buffering_hints={ 90 | "intervalInSeconds": firehose_buffer_interval, 91 | "sizeInMBs": firehose_buffer_size 92 | }, 93 | cloud_watch_logging_options={ 94 | "enabled": True, 95 | "logGroupName": firehose_log_group_name, 96 | "logStreamName": "DestinationDelivery" 97 | }, 98 | compression_format="UNCOMPRESSED", # [GZIP | HADOOP_SNAPPY | Snappy | UNCOMPRESSED | ZIP] 99 | error_output_prefix=s3_error_output_prefix, 100 | prefix=s3_output_prefix, 101 | ), 102 | buffering_hints={ 103 | "intervalInSeconds": firehose_buffer_interval, 104 | "sizeInMBs": firehose_buffer_size 105 | }, 106 | cloud_watch_logging_options={ 107 | "enabled": True, 108 | "logGroupName": firehose_log_group_name, 109 | "logStreamName": "DestinationDelivery" 110 | }, 111 | destination_table_configuration_list=[ 112 | cfn_delivery_stream.DestinationTableConfigurationProperty( 113 | destination_database_name=dest_iceberg_table_config["database_name"], 114 | destination_table_name=dest_iceberg_table_config["table_name"], 115 | unique_keys=dest_iceberg_table_unique_keys 116 
| ) 117 | ], 118 | processing_configuration=firehose_processing_config, 119 | s3_backup_mode='FailedDataOnly' 120 | ) 121 | 122 | _ = aws_kinesisfirehose.CfnDeliveryStream(self, "FirehoseToIceberg", 123 | delivery_stream_name=source_kinesis_stream.stream_name, 124 | delivery_stream_type="KinesisStreamAsSource", 125 | kinesis_stream_source_configuration={ 126 | "kinesisStreamArn": source_kinesis_stream.stream_arn, 127 | "roleArn": firehose_role.role_arn 128 | }, 129 | iceberg_destination_configuration=iceberg_dest_config, 130 | tags=[{"key": "Name", "value": source_kinesis_stream.stream_name}] 131 | ) 132 | 133 | 134 | cdk.CfnOutput(self, 'SourceKinesisStreamName', 135 | value=source_kinesis_stream.stream_name, 136 | export_name=f'{self.stack_name}-SourceKinesisStreamName') 137 | cdk.CfnOutput(self, 'S3DestBucket', 138 | value=s3_bucket.bucket_name, 139 | export_name=f'{self.stack_name}-S3DestBucket') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/kds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Duration, 12 | Stack, 13 | aws_kinesis, 14 | ) 15 | from constructs import Construct 16 | 17 | random.seed(31) 18 | 19 | 20 | class KdsStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | KINESIS_DEFAULT_STREAM_NAME = 'PUT-Firehose-{}'.format(''.join(random.sample((string.ascii_letters), k=5))) 26 | KINESIS_STREAM_NAME = self.node.try_get_context('kinesis_stream_name') or KINESIS_DEFAULT_STREAM_NAME 27 | 28 | self.kinesis_stream = aws_kinesis.Stream(self, "SourceKinesisStreams", 29 | retention_period=Duration.hours(24), 30 | stream_mode=aws_kinesis.StreamMode.ON_DEMAND, 31 | stream_name=KINESIS_STREAM_NAME) 32 | 33 | cdk.CfnOutput(self, 'KinesisDataStreamName', 34 | value=self.kinesis_stream.stream_name, 35 | export_name=f'{self.stack_name}-KinesisDataStreamName') 36 | -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/lake_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, firehose_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 20 | dest_iceberg_table_config = data_firehose_configuration["destination_iceberg_table_configuration"] 21 | database_name=dest_iceberg_table_config["database_name"] 22 | 23 | #XXXX: The role assumed by cdk is not a data lake administrator. 24 | # So, deploying PrincipalPermissions meets the error such as: 25 | # "Resource does not exist or requester is not authorized to access requested permissions." 26 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
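# This is what the CfnDataLakeSettings resource below does: it registers the CDK CloudFormation
# execution role (resolved from self.synthesizer.cloud_formation_execution_role_arn) as a
# Lake Formation data lake administrator, and CfnPrincipalPermissions then grants the Firehose
# role access to the destination database.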
27 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 28 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 29 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 30 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 31 | )] 32 | ) 33 | 34 | cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions", 35 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 36 | permissions_with_grant_option=[], 37 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 38 | data_lake_principal_identifier=firehose_role.role_arn 39 | ), 40 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 41 | #XXX: Can't specify a TableWithColumns resource and a Table resource 42 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 43 | catalog_id=cdk.Aws.ACCOUNT_ID, 44 | database_name=database_name, 45 | # name="ALL_TABLES", 46 | table_wildcard={} 47 | ) 48 | ) 49 | ) 50 | cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 51 | 52 | #XXX: In order to keep resource destruction order, 53 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 54 | cfn_principal_permissions.add_dependency(cfn_data_lake_settings) 55 | 56 | 57 | cdk.CfnOutput(self, 'Principal', 58 | value=cfn_principal_permissions.attr_principal_identifier, 59 | export_name=f'{self.stack_name}-Principal') -------------------------------------------------------------------------------- /web-analytics-iceberg/cdk_stacks/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_s3 as s3 10 | ) 11 | 12 | from constructs import Construct 13 | 14 | 15 | class S3BucketStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | S3_DEFAULT_BUCKET_NAME = f"web-analytics-{self.region}-{self.account}" 21 | data_firehose_configuration = self.node.try_get_context("data_firehose_configuration") 22 | s3_bucket_name = data_firehose_configuration.get('s3_bucket_name', S3_DEFAULT_BUCKET_NAME) 23 | 24 | self.s3_bucket = s3.Bucket(self, "s3bucket", 25 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 26 | bucket_name=s3_bucket_name) 27 | 28 | 29 | cdk.CfnOutput(self, 'S3BucketName', 30 | value=self.s3_bucket.bucket_name, 31 | export_name=f'{self.stack_name}-S3BucketName') -------------------------------------------------------------------------------- /web-analytics-iceberg/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.24.41 2 | mimesis==18.0.0 3 | requests>=2.31.0 4 | 5 | # packages for Lambda Layer 6 | fastavro==1.10.0 7 | -------------------------------------------------------------------------------- /web-analytics-iceberg/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.185.0 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /web-analytics-iceberg/source.bat: 
-------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 |
-------------------------------------------------------------------------------- /web-analytics-iceberg/src/main/python/IcebergTransformer/firehose_to_iceberg_transformer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import base64 6 | import collections 7 | import json 8 | import logging 9 | import os 10 | from datetime import datetime 11 | 12 | import fastavro 13 | 14 | 15 | LOGGER = logging.getLogger() 16 | if len(LOGGER.handlers) > 0: 17 | # The Lambda environment pre-configures a handler logging to stderr. 18 | # If a handler is already configured, `.basicConfig` does not execute. 19 | # Thus we set the level directly. 20 | LOGGER.setLevel(logging.INFO) 21 | else: 22 | logging.basicConfig(level=logging.INFO) 23 | 24 |
25 | DESTINATION_DATABASE_NAME = os.environ['IcebergDatabaseName'] 26 | DESTINATION_TABLE_NAME = os.environ['IcebergTableName'] 27 | DESTINATION_TABLE_UNIQUE_KEYS = os.environ.get('IcebergTableUniqueKeys', None) 28 | 29 | ORIGINAL_SCHEMA = { 30 | 'name': 'WebLogs', 31 | 'type': 'record', 32 | 'fields': [ 33 | { 34 | 'name': 'user_id', 35 | 'type': 'string' 36 | }, 37 | { 38 | 'name': 'session_id', 39 | 'type': 'string' 40 | }, 41 | { 42 | 'name': 'event', 43 | 'type': 'string' 44 | }, 45 | { 46 | 'name': 'referrer', 47 | 'type': ['string', 'null'] 48 | }, 49 | { 50 | 'name': 'user_agent', 51 | 'type': ['string', 'null'] 52 | }, 53 | { 54 | 'name': 'ip', 55 | 'type': 'string' 56 | }, 57 | { 58 | 'name': 'hostname', 59 | 'type': 'string' 60 | }, 61 | { 62 | 'name': 'os', 63 | 'type': ['string', 'null'] 64 | }, 65 | { 66 | 'name': 'timestamp', 67 | 'type': { 68 | 'type': 'string', 69 | 'logicalType': 'datetime' 70 | } 71 | }, 72 | { 73 | 'name': 'uri', 74 | 'type': 'string' 75 | } 76 | ] 77 | } 78 | 79 |
80 | def read_datetime(data, writer_schema=None, reader_schema=None): 81 | return datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 82 | 83 | 84 | def prepare_datetime(data, schema): 85 | """Converts datetime.datetime to a string representing the date and time""" 86 | if isinstance(data, datetime): 87 | return data.strftime('%Y-%m-%dT%H:%M:%SZ') 88 | else: 89 | try: 90 | dt = datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 91 | return dt.strftime('%Y-%m-%dT%H:%M:%SZ') 92 | except Exception as ex: 93 | return None 94 | 95 |
96 | fastavro.read.LOGICAL_READERS["string-datetime"] = read_datetime 97 | fastavro.write.LOGICAL_WRITERS["string-datetime"] = prepare_datetime 98 | 99 | PARSED_SCHEMA = fastavro.parse_schema(ORIGINAL_SCHEMA) 100 | 101 | 102 | def check_schema(record): 103 | try: 104 | return fastavro.validation.validate(record, PARSED_SCHEMA, raise_errors=False) 105 | except Exception as ex: 106 | LOGGER.error(ex) 107 | return False 108 | 109 | 110 | def lambda_handler(event, context): 111 | counter = collections.Counter(total=0, valid=0, invalid=0) 112 |
firehose_records_output = {'records': []} 113 | 114 | unique_keys_exist = True if DESTINATION_TABLE_UNIQUE_KEYS else False 115 | otf_metadata_operation = 'insert' if not unique_keys_exist else 'update' 116 | 117 | for record in event['records']: 118 | counter['total'] += 1 119 | 120 | payload = base64.b64decode(record['data']).decode('utf-8') 121 | json_value = json.loads(payload) 122 | 123 | #XXX: check if schema is valid 124 | is_valid = check_schema(json_value) 125 | counter['valid' if is_valid else 'invalid'] += 1 126 | 127 | firehose_record = { 128 | 'data': base64.b64encode(payload.encode('utf-8')), 129 | 'recordId': record['recordId'], 130 | 'result': 'Ok' if is_valid else 'ProcessingFailed', # [Ok, Dropped, ProcessingFailed] 131 | 'metadata': { 132 | 'otfMetadata': { 133 | 'destinationDatabaseName': DESTINATION_DATABASE_NAME, 134 | 'destinationTableName': DESTINATION_TABLE_NAME, 135 | 'operation': otf_metadata_operation 136 | } 137 | } 138 | } 139 | 140 | firehose_records_output['records'].append(firehose_record) 141 | 142 | LOGGER.info(', '.join("{}={}".format(k, v) for k, v in counter.items())) 143 | 144 | return firehose_records_output 145 | 146 | 147 | if __name__ == '__main__': 148 | import pprint 149 | 150 | record_list = [ 151 | ('Ok', { 152 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 153 | "session_id": "a5aa20a72c9e37588f9bbeaa", 154 | "event": "view", 155 | "referrer": "brandon.biz", 156 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 157 | "ip": "202.165.71.49", 158 | "hostname": "toxic.tokyo", 159 | "os": "openSUSE", 160 | "timestamp": "2022-09-16T07:35:46Z", 161 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 162 | }), 163 | ('Ok', { 164 | "user_id": "70b1f606-aa63-47fb-bc92-76de9c59d064", 165 | "session_id": "928e78473db8449b17644b2c", 166 | "event": "like", 167 | # missing optional data 168 | # "referrer": "toe.gq", 169 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.53", 170 | "ip": "12.166.113.176", 171 | "hostname": "drivers.glass", 172 | "os": "Windows 8.1", 173 | "timestamp": "2022-09-16T07:52:47Z", 174 | "uri": "https://aaa.gov/2022/04/29/cialis-prayer-presentations-completed-avenue-vision?trucks=cut&indeed=members" 175 | }), 176 | ('ProcessingFailed', { 177 | "user_id": "897bef5f-294d-4ecc-a3b6-ef2844958720", 178 | "session_id": "a5aa20a72c9e37588f9bbeaa", 179 | "event": "cart", 180 | "referrer": "brandon.biz", 181 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 182 | "ip": "202.165.71.49", 183 | "hostname": "toxic.tokyo", 184 | "os": "openSUSE", 185 | # invalid datetime format 186 | "timestamp": "2022-09-16 07:35:46", 187 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 188 | }), 189 | ('ProcessingFailed', { 190 | # missing required data 191 | # "user_id": "045e63c7-b276-4117-9706-7c2e3b87d5f5", 192 | "session_id": "abfd47eb7dd7b8aeec0555a7", 193 | "event": "purchase", 194 | "referrer": "transfer.edu", 195 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 9.50", 196 | "ip": "170.128.148.234", 197 | "hostname": "propecia.tc", 198 | "os": "Lubuntu", 199 | "timestamp": "2022-09-16T07:46:04Z", 200 | "uri": 
"https://pee.cloud/2019/06/15/alan-publish-perl-snow-notification-gap-improvement-guaranteed-changed-determining?casino=admissions&cottage=hotel" 201 | }), 202 | ('ProcessingFailed', { 203 | "user_id": "e504cd9d-30da-497f-8f28-2b3f64220e16", 204 | "session_id": "fd4807ab825ee8bd950b1e8b", 205 | "event": "list", 206 | "referrer": "liquid.aquitaine", 207 | "user_agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.02", 208 | # mismatched data type 209 | "ip": 212234672, 210 | "hostname": "consequently.com", 211 | "os": "Gentoo", 212 | "timestamp": "2022-09-16T07:13:29Z", 213 | "uri": "https://railway.sz/2014/10/30/use-phone-task-marketplace?pot=it&album=cook" 214 | }), 215 | ('ProcessingFailed', { 216 | # mismatched column name 217 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 218 | # mismatched column name 219 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 220 | "event": "visit", 221 | "referrer": "brandon.biz", 222 | # mismatched column name 223 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 224 | "ip": "202.165.71.49", 225 | "hostname": "toxic.tokyo", 226 | "os": "openSUSE", 227 | "timestamp": "2022-09-16T07:35:46Z", 228 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 229 | }) 230 | ] 231 | 232 | for correct_result, record in record_list: 233 | event = { 234 | "invocationId": "invocationIdExample", 235 | "deliveryStreamArn": "arn:aws:kinesis:EXAMPLE", 236 | "region": "us-east-1", 237 | "records": [ 238 | { 239 | "recordId": "49546986683135544286507457936321625675700192471156785154", 240 | "approximateArrivalTimestamp": 1495072949453, 241 | "data": base64.b64encode(json.dumps(record).encode('utf-8')) 242 | } 243 | ] 244 | } 245 | 246 | res = lambda_handler(event, {}) 247 | print(f"\n>> {correct_result} == {res['records'][0]['result']}?", res['records'][0]['result'] == correct_result) 248 | pprint.pprint(res) 249 | -------------------------------------------------------------------------------- /web-analytics-iceberg/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import ( 8 | datetime, 9 | timezone 10 | ) 11 | import json 12 | import time 13 | import typing 14 | 15 | from mimesis.locales import Locale 16 | from mimesis.schema import Field, Schema 17 | from mimesis.providers.base import BaseProvider 18 | import requests 19 | 20 | 21 | class CustomDatetime(BaseProvider): 22 | class Meta: 23 | """Class for metadata.""" 24 | name: typing.Final[str] = "custom_datetime" 25 | 26 | def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: 27 | super().__init__(*args, **kwargs) 28 | 29 | def timestamp(self) -> str: 30 | utc_now = datetime.now(timezone.utc) 31 | minute = self.random.randint(0, 59) 32 | second = self.random.randint(0, 59) 33 | random_datetime = utc_now.replace(minute=minute, second=second) 34 | return random_datetime.strftime("%Y-%m-%dT%H:%M:%SZ") 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | 40 | parser.add_argument('--api-url', help='log collector api url') 41 | parser.add_argument('--api-method', default='records', choices=['record', 'records'], 42 | help='log collector api method [record | records]') 43 | parser.add_argument('--stream-name', help='kinesis stream 
name') 44 | parser.add_argument('--max-count', default=15, type=int, help='max number of records to put') 45 | parser.add_argument('--dry-run', action='store_true') 46 | 47 | options = parser.parse_args() 48 | 49 | _field = Field(locale=Locale.EN) 50 | _field._generic.add_provider(CustomDatetime) 51 | 52 | schema_definition = lambda: { 53 | "user_id": _field("uuid"), 54 | "session_id": _field("token_hex", entropy=12), 55 | "event": _field("choice", items=['visit', 'view', 'list', 'like', 'cart', 'purchase']), 56 | "referrer": _field("internet.hostname"), 57 | "user_agent": _field("internet.user_agent"), 58 | "ip": _field("internet.ip_v4"), 59 | "hostname": _field("internet.hostname"), 60 | "os": _field("development.os"), 61 | "timestamp": _field("custom_datetime.timestamp"), 62 | "uri": _field("internet.uri", query_params_count=2) 63 | } 64 | schema = Schema(schema=schema_definition, iterations=options.max_count) 65 | 66 | log_collector_url = f'{options.api_url}/streams/{options.stream_name}/{options.api_method}' if not options.dry_run else None 67 | 68 | for record in schema: 69 | if options.dry_run: 70 | print(json.dumps(record), file=sys.stderr) 71 | continue 72 | 73 | partition_key = record['user_id'] 74 | if options.api_method == 'record': 75 | data = {'Data': record, 'PartitionKey': partition_key} 76 | payload = f'{json.dumps(data)}' 77 | else: 78 | #XXX: make sure data has newline 79 | data = {"records":[{'data': f'{json.dumps(record)}\n', 'partition-key': partition_key}]} 80 | payload = json.dumps(data) 81 | 82 | res = requests.put(log_collector_url, data=payload, headers={'Content-Type': 'application/json'}) 83 | if res.status_code == 200: 84 | print(f'[{res.status_code} {res.reason}]', res.text, file=sys.stderr) 85 | else: 86 | print(f'[{res.status_code} {res.reason}]', file=sys.stderr) 87 | sys.exit(1) 88 | time.sleep(0.5) 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /web-analytics-iceberg/src/utils/kds_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import argparse 6 | import pprint 7 | import random 8 | import time 9 | 10 | import boto3 11 | 12 | random.seed(47) 13 | 14 | SHARD_ITER_TYPE = ('TRIM_HORIZON', 'LATEST') 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--stream-name', action="store", help='kinesis stream name') 20 | parser.add_argument('--shard-id', action="store", help='kinesis stream shard-id') 21 | parser.add_argument('--iter-type', choices=SHARD_ITER_TYPE, default='LATEST', 22 | help='kinesis stream shard iterator type: [{}]'.format(', '.join(SHARD_ITER_TYPE))) 23 | parser.add_argument('--region-name', action='store', default='us-east-1', 24 | help='aws region name (default: us-east-1)') 25 | 26 | options = parser.parse_args() 27 | 28 | stream_name, shard_iter_type = options.stream_name, options.iter_type 29 | 30 | kinesis_client = boto3.client('kinesis', region_name=options.region_name) 31 | response = kinesis_client.describe_stream(StreamName=stream_name) 32 | if options.shard_id: 33 | shard_id = options.shard_id 34 | else: 35 | shard_id_list = [e['ShardId'] for e in response['StreamDescription']['Shards']] 36 | shard_id = random.choice(shard_id_list) 37 | 38 | shard_iterator = kinesis_client.get_shard_iterator(StreamName=stream_name, 39 | ShardId=shard_id, 40 | 
ShardIteratorType=shard_iter_type) 41 | 42 | shard_iter = shard_iterator['ShardIterator'] 43 | record_response = kinesis_client.get_records(ShardIterator=shard_iter, Limit=123) 44 | pprint.pprint(record_response.get('Records', []), indent=2) 45 | 46 | while 'NextShardIterator' in record_response: 47 | record_response = kinesis_client.get_records(ShardIterator=record_response['NextShardIterator'], Limit=123) 48 | pprint.pprint(record_response.get('Records', []), indent=2) 49 | 50 | # wait for a few seconds 51 | time.sleep(5) 52 | 53 | if __name__ == '__main__': 54 | main() 55 | 56 | -------------------------------------------------------------------------------- /web-analytics-parquet/.example.cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "acknowledged-issue-numbers": [ 3 | 32775 4 | ], 5 | "firehose_data_tranform_lambda": { 6 | "s3_bucket_name": "s3-bucket-name-for-lambda-layer-resources", 7 | "s3_object_key": "var/fastavro-lib.zip" 8 | }, 9 | "firehose": { 10 | "buffer_size_in_mbs": 128, 11 | "buffer_interval_in_seconds": 300, 12 | "lambda_buffer_size_in_mbs": 3, 13 | "lambda_buffer_interval_in_seconds": 300, 14 | "lambda_number_of_retries": 3, 15 | "s3_output_folder": "json-data", 16 | "prefix": "json-data/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/", 17 | "error_output_prefix": "error/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/!{firehose:error-output-type}" 18 | }, 19 | "merge_small_files_lambda_env": { 20 | "OLD_DATABASE": "mydatabase", 21 | "OLD_TABLE_NAME": "web_log_json", 22 | "NEW_DATABASE": "mydatabase", 23 | "NEW_TABLE_NAME": "web_log_parquet", 24 | "NEW_TABLE_S3_FOLDER_NAME": "parquet-data", 25 | "COLUMN_NAMES": "userId,sessionId,referrer,userAgent,ip,hostname,os,timestamp,uri" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /web-analytics-parquet/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | package-lock.json 4 | __pycache__ 5 | .pytest_cache 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /web-analytics-parquet/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /web-analytics-parquet/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 
8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /web-analytics-parquet/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /web-analytics-parquet/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from cdk_stacks import ( 10 | KdsProxyApiGwStack, 11 | KdsStack, 12 | FirehoseDataTransformLambdaStack, 13 | FirehoseStack, 14 | MergeSmallFilesLambdaStack, 15 | AthenaWorkGroupStack, 16 | AthenaNamedQueryStack, 17 | GlueCatalogDatabaseStack, 18 | DataLakePermissionsStack 19 | ) 20 | 21 | AWS_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), 22 | region=os.getenv('CDK_DEFAULT_REGION')) 23 | 24 | app = cdk.App() 25 | 26 | kds_proxy_apigw = KdsProxyApiGwStack(app, 'WebAnalyticsKdsProxyApiGw') 27 | kds_stack = KdsStack(app, 'WebAnalyticsKinesisStream') 28 | 29 | firehose_data_transform_lambda = FirehoseDataTransformLambdaStack(app, 30 | 'WebAnalyticsFirehoseDataTransformLambda') 31 | firehose_data_transform_lambda.add_dependency(kds_stack) 32 | 33 | firehose_stack = FirehoseStack(app, 'WebAnalyticsFirehose', 34 | kds_stack.target_kinesis_stream.stream_arn, 35 | firehose_data_transform_lambda.schema_validator_lambda_fn) 36 | firehose_stack.add_dependency(firehose_data_transform_lambda) 37 | 38 | athena_work_group_stack = AthenaWorkGroupStack(app, 39 | 'WebAnalyticsAthenaWorkGroup' 40 | ) 41 | athena_work_group_stack.add_dependency(firehose_stack) 42 | 43 | merge_small_files_stack = MergeSmallFilesLambdaStack(app, 44 | 'WebAnalyticsMergeSmallFiles', 45 | firehose_stack.s3_dest_bucket_name, 46 | firehose_stack.s3_dest_folder_name, 47 | athena_work_group_stack.athena_work_group_name 48 | ) 49 | merge_small_files_stack.add_dependency(athena_work_group_stack) 50 | 51 | athena_databases = GlueCatalogDatabaseStack(app, 'WebAnalyticsGlueDatabases') 52 | athena_databases.add_dependency(merge_small_files_stack) 53 | 54 | lakeformation_grant_permissions = DataLakePermissionsStack(app, 'WebAnalyticsGrantLFPermissionsOnMergeFilesJob', 55 | merge_small_files_stack.lambda_exec_role 56 | ) 57 | lakeformation_grant_permissions.add_dependency(athena_databases) 58 | 59 | athena_named_query_stack = AthenaNamedQueryStack(app, 60 | 'WebAnalyticsAthenaNamedQueries', 61 | athena_work_group_stack.athena_work_group_name, 62 | merge_small_files_stack.s3_json_location, 63 | merge_small_files_stack.s3_parquet_location 64 | ) 65 | athena_named_query_stack.add_dependency(lakeformation_grant_permissions) 66 | 67 | app.synth() 68 | 
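# A possible end-to-end deployment flow for the stacks defined above (a sketch, assuming the
# CDK CLI is installed, AWS credentials are configured, the fastavro Lambda layer zip has been
# uploaded with build-aws-lambda-layer-package.sh, and cdk.context.json is filled in along the
# lines of .example.cdk.context.json):
#
#   (.venv) $ cdk bootstrap
#   (.venv) $ cdk deploy --require-approval never --all
#
# Because of the add_dependency() calls above, `cdk deploy --all` provisions the stacks in
# dependency order; individual stacks can also be deployed by name, e.g.
# `cdk deploy WebAnalyticsKinesisStream`.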
-------------------------------------------------------------------------------- /web-analytics-parquet/assets/amazon-athena-switching-to-workgroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-parquet/assets/amazon-athena-switching-to-workgroup.png -------------------------------------------------------------------------------- /web-analytics-parquet/assets/data-lake-formation-permissions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/web-analytics-on-aws/ae85ae98478a174c9e45292a98f11dfef6574814/web-analytics-parquet/assets/data-lake-formation-permissions.png -------------------------------------------------------------------------------- /web-analytics-parquet/build-aws-lambda-layer-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash - 2 | 3 | LAMBDA_LAYER_NAME=fastavro-lib 4 | S3_PATH=$1 5 | 6 | docker run -v "$PWD":/var/task "public.ecr.aws/sam/build-python3.9" /bin/sh -c "pip install fastavro==1.6.1 -t python/lib/python3.9/site-packages/; exit" 7 | 8 | zip -q -r ${LAMBDA_LAYER_NAME}.zip python >/dev/null 9 | aws s3 cp --quiet ${LAMBDA_LAYER_NAME}.zip s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip 10 | echo "[Lambda_Layer_Code_S3_Path] s3://${S3_PATH}/${LAMBDA_LAYER_NAME}.zip" 11 | 12 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .apigw import KdsProxyApiGwStack 2 | from .athena_workgroup import AthenaWorkGroupStack 3 | from .athena_named_query import AthenaNamedQueryStack 4 | from .firehose_dtata_transform_lambda import FirehoseDataTransformLambdaStack 5 | from 
.firehose import FirehoseStack 6 | from .kds import KdsStack 7 | from .merge_small_files_lambda import MergeSmallFilesLambdaStack 8 | from .vpc import VpcStack 9 | from .glue_catalog_database import GlueCatalogDatabaseStack 10 | from .lake_formation import DataLakePermissionsStack 11 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/apigw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import json 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_apigateway, 12 | aws_iam, 13 | ) 14 | from constructs import Construct 15 | 16 | 17 | class KdsProxyApiGwStack(Stack): 18 | 19 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 20 | super().__init__(scope, construct_id, **kwargs) 21 | 22 | apigw_kds_access_role_policy_doc = aws_iam.PolicyDocument() 23 | apigw_kds_access_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 24 | "effect": aws_iam.Effect.ALLOW, 25 | "resources": ["*"], 26 | "actions": [ 27 | "kinesis:DescribeStream", 28 | "kinesis:PutRecord", 29 | "kinesis:PutRecords"] 30 | })) 31 | 32 | apigw_kds_role = aws_iam.Role(self, "APIGatewayRoleToAccessKinesisDataStreams", 33 | role_name='APIGatewayRoleToAccessKinesisDataStreams', 34 | assumed_by=aws_iam.ServicePrincipal('apigateway.amazonaws.com'), 35 | inline_policies={ 36 | 'KinesisWriteAccess': apigw_kds_access_role_policy_doc 37 | }, 38 | managed_policies=[ 39 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonKinesisReadOnlyAccess') 40 | ] 41 | ) 42 | 43 | #XXX: Start to create an API as a Kinesis proxy 44 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-api-as-kinesis-proxy 45 | kds_proxy_api = aws_apigateway.RestApi(self, "KdsProxyAPI", 46 | rest_api_name="log-collector", 47 | description="An Amazon API Gateway REST API that integrated with an Amazon Kinesis Data Streams.", 48 | endpoint_types=[aws_apigateway.EndpointType.REGIONAL], 49 | default_cors_preflight_options={ 50 | "allow_origins": aws_apigateway.Cors.ALL_ORIGINS 51 | }, 52 | deploy=True, 53 | deploy_options=aws_apigateway.StageOptions(stage_name="v1"), 54 | endpoint_export_name="KdsProxyAPIEndpoint" 55 | ) 56 | 57 | apigw_error_responses = [ 58 | aws_apigateway.IntegrationResponse(status_code="400", selection_pattern="4\\d{2}"), 59 | aws_apigateway.IntegrationResponse(status_code="500", selection_pattern="5\\d{2}") 60 | ] 61 | 62 | #XXX: GET /streams 63 | # List Kinesis streams by using the API Gateway console 64 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-list-kinesis-streams 65 | 66 | streams_resource = kds_proxy_api.root.add_resource("streams") 67 | 68 | list_streams_options = aws_apigateway.IntegrationOptions( 69 | credentials_role=apigw_kds_role, 70 | integration_responses=[ 71 | aws_apigateway.IntegrationResponse( 72 | status_code="200" 73 | ), 74 | *apigw_error_responses 75 | ], 76 | request_templates={ 77 | 'application/json': '{}' 78 | }, 79 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 80 | ) 81 | 82 | list_streams_integration = aws_apigateway.AwsIntegration( 83 | service='kinesis', 84 | action='ListStreams', 85 | integration_http_method='POST', 86 | options=list_streams_options 
87 | ) 88 | 89 | streams_resource.add_method("GET", list_streams_integration, 90 | # Default `authorization_type`: - open access unless `authorizer` is specified 91 | authorization_type=aws_apigateway.AuthorizationType.NONE, 92 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 93 | response_models={ 94 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 95 | } 96 | ), 97 | aws_apigateway.MethodResponse(status_code='400'), 98 | aws_apigateway.MethodResponse(status_code='500') 99 | ]) 100 | 101 | #XXX: GET /streams/{stream-name} 102 | # Describe a stream in Kinesis 103 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-create-describe-delete-stream 104 | one_stream_resource = streams_resource.add_resource("{stream-name}") 105 | 106 | describe_stream_options = aws_apigateway.IntegrationOptions( 107 | credentials_role=apigw_kds_role, 108 | integration_responses=[ 109 | aws_apigateway.IntegrationResponse( 110 | status_code="200" 111 | ), 112 | *apigw_error_responses 113 | ], 114 | request_templates={ 115 | 'application/json': json.dumps({ 116 | "StreamName": "$input.params('stream-name')" 117 | }, indent=2) 118 | }, 119 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 120 | ) 121 | 122 | describe_stream_integration = aws_apigateway.AwsIntegration( 123 | service='kinesis', 124 | action='DescribeStream', 125 | integration_http_method='POST', 126 | options=describe_stream_options 127 | ) 128 | 129 | one_stream_resource.add_method("GET", describe_stream_integration, 130 | # Default `authorization_type`: - open access unless `authorizer` is specified 131 | authorization_type=aws_apigateway.AuthorizationType.NONE, 132 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 133 | response_models={ 134 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 135 | } 136 | ), 137 | aws_apigateway.MethodResponse(status_code='400'), 138 | aws_apigateway.MethodResponse(status_code='500') 139 | ]) 140 | 141 | #XXX: PUT /streams/{stream-name}/record 142 | # Put a record into a stream in Kinesis 143 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 144 | record_resource = one_stream_resource.add_resource("record") 145 | 146 | put_record_request_mapping_templates = ''' 147 | { 148 | "StreamName": "$input.params('stream-name')", 149 | "Data": "$util.base64Encode($input.json('$.Data'))", 150 | "PartitionKey": "$input.path('$.PartitionKey')" 151 | } 152 | ''' 153 | 154 | put_record_options = aws_apigateway.IntegrationOptions( 155 | credentials_role=apigw_kds_role, 156 | integration_responses=[ 157 | aws_apigateway.IntegrationResponse( 158 | status_code="200" 159 | ), 160 | *apigw_error_responses 161 | ], 162 | request_templates={ 163 | 'application/json': put_record_request_mapping_templates 164 | }, 165 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 166 | ) 167 | 168 | put_record_integration = aws_apigateway.AwsIntegration( 169 | service='kinesis', 170 | action='PutRecord', 171 | integration_http_method='POST', 172 | options=put_record_options 173 | ) 174 | 175 | record_resource.add_method("PUT", put_record_integration, 176 | # Default `authorization_type`: - open access unless `authorizer` is specified 177 | authorization_type=aws_apigateway.AuthorizationType.NONE, 178 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 179 | 
response_models={ 180 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 181 | } 182 | ), 183 | aws_apigateway.MethodResponse(status_code='400'), 184 | aws_apigateway.MethodResponse(status_code='500') 185 | ]) 186 | 187 | 188 | #XXX: PUT /streams/{stream-name}/records 189 | # Put records into a stream in Kinesis 190 | # https://docs.aws.amazon.com/apigateway/latest/developerguide/integrating-api-with-aws-services-kinesis.html#api-gateway-get-and-add-records-to-stream 191 | records_resource = one_stream_resource.add_resource("records") 192 | 193 | put_records_request_mapping_templates = ''' 194 | { 195 | "StreamName": "$input.params('stream-name')", 196 | "Records": [ 197 | #foreach($elem in $input.path('$.records')) 198 | { 199 | "Data": "$util.base64Encode($elem.data)", 200 | "PartitionKey": "$elem.partition-key" 201 | }#if($foreach.hasNext),#end 202 | #end 203 | ] 204 | } 205 | ''' 206 | 207 | put_records_options = aws_apigateway.IntegrationOptions( 208 | credentials_role=apigw_kds_role, 209 | integration_responses=[ 210 | aws_apigateway.IntegrationResponse( 211 | status_code="200" 212 | ), 213 | *apigw_error_responses 214 | ], 215 | request_templates={ 216 | 'application/json': put_records_request_mapping_templates 217 | }, 218 | passthrough_behavior=aws_apigateway.PassthroughBehavior.WHEN_NO_TEMPLATES 219 | ) 220 | 221 | put_records_integration = aws_apigateway.AwsIntegration( 222 | service='kinesis', 223 | action='PutRecords', 224 | integration_http_method='POST', 225 | options=put_records_options 226 | ) 227 | 228 | records_resource.add_method("PUT", put_records_integration, 229 | # Default `authorization_type`: - open access unless `authorizer` is specified 230 | authorization_type=aws_apigateway.AuthorizationType.NONE, 231 | method_responses=[aws_apigateway.MethodResponse(status_code='200', 232 | response_models={ 233 | 'application/json': aws_apigateway.Model.EMPTY_MODEL 234 | } 235 | ), 236 | aws_apigateway.MethodResponse(status_code='400'), 237 | aws_apigateway.MethodResponse(status_code='500') 238 | ]) 239 | 240 | cdk.CfnOutput(self, 'KdsRestApiName', 241 | value=kds_proxy_api.rest_api_name, 242 | export_name=f'{self.stack_name}-KdsProxyRestApiName') 243 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/athena_named_query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_athena 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class AthenaNamedQueryStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, athena_work_group_name, s3_json_location, s3_parquet_location, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | query_for_json_table = '''/* Create your database */ 20 | CREATE DATABASE IF NOT EXISTS mydatabase; 21 | 22 | /* Create table with partitions */ 23 | CREATE EXTERNAL TABLE `mydatabase.web_log_json`( 24 | `userId` string, 25 | `sessionId` string, 26 | `referrer` string, 27 | `userAgent` string, 28 | `ip` string, 29 | `hostname` string, 30 | `os` string, 31 | `timestamp` timestamp, 32 | `uri` string) 33 | PARTITIONED BY ( 34 | `year` int, 35 | `month` int, 36 | `day` int, 37 | `hour` int) 38 | ROW FORMAT SERDE 39 | 'org.openx.data.jsonserde.JsonSerDe' 40 | STORED AS INPUTFORMAT 41 | 
'org.apache.hadoop.mapred.TextInputFormat' 42 | OUTPUTFORMAT 43 | 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' 44 | LOCATION 45 | '{s3_location}'; 46 | 47 | /* Next we will load the partitions for this table */ 48 | MSCK REPAIR TABLE mydatabase.web_log_json; 49 | 50 | /* Check the partitions */ 51 | SHOW PARTITIONS mydatabase.web_log_json; 52 | 53 | SELECT COUNT(*) FROM mydatabase.web_log_json; 54 | '''.format(s3_location=s3_json_location) 55 | 56 | named_query_for_json_table = aws_athena.CfnNamedQuery(self, "MyAthenaCfnNamedQuery1", 57 | database="default", 58 | query_string=query_for_json_table, 59 | 60 | # the properties below are optional 61 | description="Sample Hive DDL statement to create a partitioned table pointing to web log data (json)", 62 | name="Create Web Log table (json) with partitions", 63 | work_group=athena_work_group_name 64 | ) 65 | 66 | query_for_parquet_table = '''/* Create your database */ 67 | CREATE DATABASE IF NOT EXISTS mydatabase; 68 | 69 | /* Create table with partitions */ 70 | CREATE EXTERNAL TABLE `mydatabase.web_log_parquet`( 71 | `userId` string, 72 | `sessionId` string, 73 | `referrer` string, 74 | `userAgent` string, 75 | `ip` string, 76 | `hostname` string, 77 | `os` string, 78 | `timestamp` timestamp, 79 | `uri` string) 80 | PARTITIONED BY ( 81 | `year` int, 82 | `month` int, 83 | `day` int, 84 | `hour` int) 85 | ROW FORMAT SERDE 86 | 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 87 | STORED AS INPUTFORMAT 88 | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 89 | OUTPUTFORMAT 90 | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 91 | LOCATION 92 | '{s3_location}'; 93 | 94 | /* Next we will load the partitions for this table */ 95 | MSCK REPAIR TABLE mydatabase.web_log_parquet; 96 | 97 | /* Check the partitions */ 98 | SHOW PARTITIONS mydatabase.web_log_parquet; 99 | 100 | SELECT COUNT(*) FROM mydatabase.web_log_parquet; 101 | '''.format(s3_location=s3_parquet_location) 102 | 103 | named_query_for_parquet_table = aws_athena.CfnNamedQuery(self, "MyAthenaCfnNamedQuery2", 104 | database="default", 105 | query_string=query_for_parquet_table, 106 | 107 | # the properties below are optional 108 | description="Sample Hive DDL statement to create a partitioned table pointing to web log data (parquet)", 109 | name="Create Web Log table (parquet) with partitions", 110 | work_group=athena_work_group_name 111 | ) 112 | 113 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/athena_workgroup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_athena, 10 | aws_s3 as s3, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class AthenaWorkGroupStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | ATHENA_WORK_GROUP_NAME = self.node.try_get_context('athena_workgroup_name') or 'WebAnalyticsGroup' 21 | 22 | S3_DEFAULT_BUCKET_NAME = 'aws-athena-query-results-{region}-{account_id}'.format( 23 | region=cdk.Aws.REGION, account_id=cdk.Aws.ACCOUNT_ID) 24 | s3_bucket = s3.Bucket(self, "s3bucket", 25 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: core.RemovalPolicy.RETAIN - The bucket will be orphaned 26 | 
bucket_name=S3_DEFAULT_BUCKET_NAME) 27 | 28 | athena_cfn_work_group = aws_athena.CfnWorkGroup(self, 'AthenaCfnWorkGroup', 29 | name=ATHENA_WORK_GROUP_NAME, 30 | 31 | # the properties below are optional 32 | description='workgroup for developer', 33 | recursive_delete_option=False, 34 | state='ENABLED', # [DISABLED, ENABLED] 35 | tags=[cdk.CfnTag( 36 | key='Name', 37 | value=ATHENA_WORK_GROUP_NAME 38 | )], 39 | work_group_configuration=aws_athena.CfnWorkGroup.WorkGroupConfigurationProperty( 40 | #XXX: EnforceWorkGroupConfiguration 41 | # Link: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-athena-workgroup-workgroupconfiguration.html#cfn-athena-workgroup-workgroupconfiguration-enforceworkgroupconfiguration 42 | # If set to "true", the settings for the workgroup override client-side settings. 43 | # If set to "false", client-side settings are used. 44 | enforce_work_group_configuration=False, 45 | engine_version=aws_athena.CfnWorkGroup.EngineVersionProperty( 46 | effective_engine_version='Athena engine version 3', 47 | selected_engine_version='Athena engine version 3' 48 | ), 49 | publish_cloud_watch_metrics_enabled=True, 50 | requester_pays_enabled=True, 51 | result_configuration=aws_athena.CfnWorkGroup.ResultConfigurationProperty( 52 | output_location=s3_bucket.s3_url_for_object() 53 | ) 54 | ) 55 | ) 56 | 57 | self.athena_work_group_name = athena_cfn_work_group.name 58 | 59 | cdk.CfnOutput(self, 'AthenaWorkGroupName', 60 | value=self.athena_work_group_name, 61 | export_name=f'{self.stack_name}-AthenaWorkGroupName') 62 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/firehose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Stack, 12 | aws_iam, 13 | aws_s3 as s3, 14 | aws_kinesisfirehose, 15 | ) 16 | 17 | from constructs import Construct 18 | from aws_cdk.aws_kinesisfirehose import CfnDeliveryStream as cfn 19 | 20 | random.seed(31) 21 | 22 | 23 | class FirehoseStack(Stack): 24 | 25 | def __init__(self, scope: Construct, construct_id: str, source_kinesis_stream_arn, data_transform_lambda_fn, **kwargs) -> None: 26 | super().__init__(scope, construct_id, **kwargs) 27 | 28 | s3_bucket = s3.Bucket(self, "s3bucket", 29 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned 30 | bucket_name="web-analytics-{region}-{account_id}".format( 31 | region=cdk.Aws.REGION, account_id=cdk.Aws.ACCOUNT_ID)) 32 | 33 | FIREHOSE_DEFAULT_STREAM_NAME = 'PUT-S3-{}'.format(''.join(random.sample((string.ascii_letters), k=5))) 34 | firehose_config = self.node.try_get_context('firehose') 35 | 36 | FIREHOSE_STREAM_NAME = firehose_config.get('stream_name', FIREHOSE_DEFAULT_STREAM_NAME) 37 | FIREHOSE_BUFFER_SIZE = firehose_config['buffer_size_in_mbs'] 38 | FIREHOSE_BUFFER_INTERVAL = firehose_config['buffer_interval_in_seconds'] 39 | FIREHOSE_LAMBDA_BUFFER_SIZE = firehose_config['lambda_buffer_size_in_mbs'] 40 | FIREHOSE_LAMBDA_BUFFER_INTERVAL = firehose_config['lambda_buffer_interval_in_seconds'] 41 | FIREHOSE_LAMBDA_NUMBER_OF_RETRIES = firehose_config['lambda_number_of_retries'] 42 | FIREHOSE_TO_S3_PREFIX = firehose_config['prefix'] 43 | FIREHOSE_TO_S3_ERROR_OUTPUT_PREFIX = firehose_config['error_output_prefix'] 
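# These settings come from the "firehose" block of cdk.context.json (see .example.cdk.context.json
# in this project for a working shape); the assert below checks that `prefix` starts with
# "<s3_output_folder>/", so the folder name handed to the merge-small-files stack matches
# where Firehose actually writes.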
44 | FIREHOSE_TO_S3_OUTPUT_FOLDER = firehose_config['s3_output_folder'] 45 | 46 | assert f'{FIREHOSE_TO_S3_OUTPUT_FOLDER}/' == FIREHOSE_TO_S3_PREFIX[:len(FIREHOSE_TO_S3_OUTPUT_FOLDER) + 1] 47 | 48 | firehose_role_policy_doc = aws_iam.PolicyDocument() 49 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 50 | "effect": aws_iam.Effect.ALLOW, 51 | "resources": [s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)], 52 | "actions": ["s3:AbortMultipartUpload", 53 | "s3:GetBucketLocation", 54 | "s3:GetObject", 55 | "s3:ListBucket", 56 | "s3:ListBucketMultipartUploads", 57 | "s3:PutObject"] 58 | })) 59 | 60 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 61 | effect=aws_iam.Effect.ALLOW, 62 | resources=["*"], 63 | actions=["ec2:DescribeVpcs", 64 | "ec2:DescribeVpcAttribute", 65 | "ec2:DescribeSubnets", 66 | "ec2:DescribeSecurityGroups", 67 | "ec2:DescribeNetworkInterfaces", 68 | "ec2:CreateNetworkInterface", 69 | "ec2:CreateNetworkInterfacePermission", 70 | "ec2:DeleteNetworkInterface"] 71 | )) 72 | 73 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 74 | effect=aws_iam.Effect.ALLOW, 75 | resources=["*"], 76 | actions=["glue:GetTable", 77 | "glue:GetTableVersion", 78 | "glue:GetTableVersions"] 79 | )) 80 | 81 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 82 | effect=aws_iam.Effect.ALLOW, 83 | resources=[source_kinesis_stream_arn], 84 | actions=["kinesis:DescribeStream", 85 | "kinesis:GetShardIterator", 86 | "kinesis:GetRecords"] 87 | )) 88 | 89 | firehose_log_group_name = f"/aws/kinesisfirehose/{FIREHOSE_STREAM_NAME}" 90 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement( 91 | effect=aws_iam.Effect.ALLOW, 92 | #XXX: The ARN will be formatted as follows: 93 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 94 | resources=[self.format_arn(service="logs", resource="log-group", 95 | resource_name="{}:log-stream:*".format(firehose_log_group_name), 96 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 97 | actions=["logs:PutLogEvents"] 98 | )) 99 | 100 | firehose_role_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 101 | "effect": aws_iam.Effect.ALLOW, 102 | #XXX: The ARN will be formatted as follows: 103 | # arn:{partition}:{service}:{region}:{account}:{resource}{sep}}{resource-name} 104 | "resources": [self.format_arn(partition="aws", service="lambda", 105 | region=cdk.Aws.REGION, account=cdk.Aws.ACCOUNT_ID, resource="function", 106 | resource_name="{}:*".format(data_transform_lambda_fn.function_name), 107 | arn_format=cdk.ArnFormat.COLON_RESOURCE_NAME)], 108 | "actions": ["lambda:InvokeFunction", 109 | "lambda:GetFunctionConfiguration"] 110 | })) 111 | 112 | firehose_role = aws_iam.Role(self, "KinesisFirehoseDeliveryRole", 113 | role_name="KinesisFirehoseServiceRole-{stream_name}-{region}".format( 114 | stream_name=FIREHOSE_STREAM_NAME, region=cdk.Aws.REGION), 115 | assumed_by=aws_iam.ServicePrincipal("firehose.amazonaws.com"), 116 | #XXX: use inline_policies to work around https://github.com/aws/aws-cdk/issues/5221 117 | inline_policies={ 118 | "firehose_role_policy": firehose_role_policy_doc 119 | } 120 | ) 121 | 122 | lambda_proc = cfn.ProcessorProperty( 123 | type="Lambda", 124 | parameters=[ 125 | cfn.ProcessorParameterProperty( 126 | parameter_name="LambdaArn", 127 | # parameter_value='{}:{}'.format(schema_validator_lambda_fn.function_arn, schema_validator_lambda_fn.current_version.version) 128 | 
parameter_value='{}:{}'.format(data_transform_lambda_fn.function_arn, data_transform_lambda_fn.latest_version.version) 129 | ), 130 | cfn.ProcessorParameterProperty( 131 | parameter_name="NumberOfRetries", 132 | parameter_value=str(FIREHOSE_LAMBDA_NUMBER_OF_RETRIES) 133 | ), 134 | cfn.ProcessorParameterProperty( 135 | parameter_name="RoleArn", 136 | parameter_value=firehose_role.role_arn 137 | ), 138 | cfn.ProcessorParameterProperty( 139 | parameter_name="BufferSizeInMBs", 140 | parameter_value=str(FIREHOSE_LAMBDA_BUFFER_SIZE) 141 | ), 142 | cfn.ProcessorParameterProperty( 143 | parameter_name="BufferIntervalInSeconds", 144 | parameter_value=str(FIREHOSE_LAMBDA_BUFFER_INTERVAL) 145 | ) 146 | ] 147 | ) 148 | 149 | firehose_processing_config = cfn.ProcessingConfigurationProperty( 150 | enabled=True, 151 | processors=[ 152 | lambda_proc 153 | ] 154 | ) 155 | 156 | ext_s3_dest_config = cfn.ExtendedS3DestinationConfigurationProperty( 157 | bucket_arn=s3_bucket.bucket_arn, 158 | role_arn=firehose_role.role_arn, 159 | buffering_hints={ 160 | "intervalInSeconds": FIREHOSE_BUFFER_INTERVAL, 161 | "sizeInMBs": FIREHOSE_BUFFER_SIZE 162 | }, 163 | cloud_watch_logging_options={ 164 | "enabled": True, 165 | "logGroupName": firehose_log_group_name, 166 | "logStreamName": "S3Delivery" 167 | }, 168 | compression_format="UNCOMPRESSED", # [GZIP | HADOOP_SNAPPY | Snappy | UNCOMPRESSED | ZIP] 169 | data_format_conversion_configuration={ 170 | "enabled": False 171 | }, 172 | dynamic_partitioning_configuration={ 173 | "enabled": False 174 | }, 175 | error_output_prefix=FIREHOSE_TO_S3_ERROR_OUTPUT_PREFIX, 176 | prefix=FIREHOSE_TO_S3_PREFIX, 177 | processing_configuration=firehose_processing_config 178 | ) 179 | 180 | firehose_to_s3_delivery_stream = aws_kinesisfirehose.CfnDeliveryStream(self, "KinesisFirehoseToS3", 181 | delivery_stream_name=FIREHOSE_STREAM_NAME, 182 | delivery_stream_type="KinesisStreamAsSource", 183 | kinesis_stream_source_configuration={ 184 | "kinesisStreamArn": source_kinesis_stream_arn, 185 | "roleArn": firehose_role.role_arn 186 | }, 187 | extended_s3_destination_configuration=ext_s3_dest_config 188 | ) 189 | 190 | self.s3_dest_bucket_name = s3_bucket.bucket_name 191 | self.s3_dest_folder_name = FIREHOSE_TO_S3_OUTPUT_FOLDER 192 | 193 | cdk.CfnOutput(self, 'S3DestBucket', 194 | value=s3_bucket.bucket_name, 195 | export_name=f'{self.stack_name}-S3DestBucket') 196 | cdk.CfnOutput(self, 'KinesisDataFirehoseName', 197 | value=firehose_to_s3_delivery_stream.delivery_stream_name, 198 | export_name=f'{self.stack_name}-KinesisDataFirehoseName') 199 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/firehose_dtata_transform_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lambda, 10 | aws_logs, 11 | aws_s3 as s3 12 | ) 13 | from constructs import Construct 14 | 15 | 16 | class FirehoseDataTransformLambdaStack(Stack): 17 | 18 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 19 | super().__init__(scope, construct_id, **kwargs) 20 | 21 | firehose_data_transform_lambda_config = self.node.try_get_context('firehose_data_tranform_lambda') 22 | LAMBDA_LAYER_CODE_S3_BUCKET = firehose_data_transform_lambda_config['s3_bucket_name'] 23 | LAMBDA_LAYER_CODE_S3_OBJ_KEY = 
firehose_data_transform_lambda_config['s3_object_key'] 24 | 25 | s3_lambda_layer_lib_bucket = s3.Bucket.from_bucket_name(self, "LambdaLayerS3Bucket", LAMBDA_LAYER_CODE_S3_BUCKET) 26 | lambda_lib_layer = aws_lambda.LayerVersion(self, "SchemaValidatorLib", 27 | layer_version_name="fastavro-lib", 28 | compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11], 29 | code=aws_lambda.Code.from_bucket(s3_lambda_layer_lib_bucket, LAMBDA_LAYER_CODE_S3_OBJ_KEY) 30 | ) 31 | 32 | SCHEMA_VALIDATOR_LAMBDA_FN_NAME = "SchemaValidator" 33 | schema_validator_lambda_fn = aws_lambda.Function(self, "SchemaValidator", 34 | runtime=aws_lambda.Runtime.PYTHON_3_11, 35 | function_name=SCHEMA_VALIDATOR_LAMBDA_FN_NAME, 36 | handler="schema_validator.lambda_handler", 37 | description="Check if records have valid schema", 38 | code=aws_lambda.Code.from_asset('./src/main/python/SchemaValidator'), 39 | timeout=cdk.Duration.minutes(5), 40 | #XXX: set memory size appropriately 41 | memory_size=256, 42 | layers=[lambda_lib_layer] 43 | ) 44 | 45 | log_group = aws_logs.LogGroup(self, "SchemaValidatorLogGroup", 46 | #XXX: Circular dependency between resources occurs 47 | # if aws_lambda.Function.function_name is used 48 | # instead of literal name of lambda function such as "SchemaValidator" 49 | log_group_name="/aws/lambda/{}".format(SCHEMA_VALIDATOR_LAMBDA_FN_NAME), 50 | retention=aws_logs.RetentionDays.THREE_DAYS, 51 | removal_policy=cdk.RemovalPolicy.DESTROY 52 | ) 53 | log_group.grant_write(schema_validator_lambda_fn) 54 | 55 | self.schema_validator_lambda_fn = schema_validator_lambda_fn 56 | 57 | cdk.CfnOutput(self, 'FirehoseDataTransformFuncName', 58 | value=self.schema_validator_lambda_fn.function_name, 59 | export_name=f'{self.stack_name}-FirehoseDataTransformFuncName') 60 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/glue_catalog_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_glue 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class GlueCatalogDatabaseStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | athena_database_info = self.node.try_get_context('merge_small_files_lambda_env') 20 | old_database_name = athena_database_info['OLD_DATABASE'] 21 | new_database_name = athena_database_info['NEW_DATABASE'] 22 | 23 | for idx, database_name in enumerate(list(set([old_database_name, new_database_name]))): 24 | cfn_database = aws_glue.CfnDatabase(self, f"GlueCfnDatabase{idx}", 25 | catalog_id=cdk.Aws.ACCOUNT_ID, 26 | database_input=aws_glue.CfnDatabase.DatabaseInputProperty( 27 | name=database_name 28 | ) 29 | ) 30 | cfn_database.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 31 | 32 | cdk.CfnOutput(self, f'GlueDatabaseName{idx}', 33 | value=cfn_database.database_input.name, 34 | export_name=f'{self.stack_name}-GlueDatabaseName{idx}') 35 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/kds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | 
import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Duration, 12 | Stack, 13 | aws_kinesis, 14 | ) 15 | from constructs import Construct 16 | 17 | random.seed(31) 18 | 19 | 20 | class KdsStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | KINESIS_DEFAULT_STREAM_NAME = 'PUT-Firehose-{}'.format(''.join(random.sample((string.ascii_letters), k=5))) 26 | KINESIS_STREAM_NAME = self.node.try_get_context('kinesis_stream_name') or KINESIS_DEFAULT_STREAM_NAME 27 | 28 | source_kinesis_stream = aws_kinesis.Stream(self, "SourceKinesisStreams", 29 | retention_period=Duration.hours(24), 30 | stream_mode=aws_kinesis.StreamMode.ON_DEMAND, 31 | stream_name=KINESIS_STREAM_NAME) 32 | 33 | self.target_kinesis_stream = source_kinesis_stream 34 | 35 | cdk.CfnOutput(self, 'KinesisDataStreamName', 36 | value=self.target_kinesis_stream.stream_name, 37 | export_name=f'{self.stack_name}-KinesisDataStreamName') 38 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/lake_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import aws_cdk as cdk 6 | 7 | from aws_cdk import ( 8 | Stack, 9 | aws_lakeformation 10 | ) 11 | from constructs import Construct 12 | 13 | 14 | class DataLakePermissionsStack(Stack): 15 | 16 | def __init__(self, scope: Construct, construct_id: str, job_role, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | athena_database_info = self.node.try_get_context('merge_small_files_lambda_env') 20 | old_database_name = athena_database_info['OLD_DATABASE'] 21 | new_database_name = athena_database_info['NEW_DATABASE'] 22 | 23 | database_list = list(set([old_database_name, new_database_name])) 24 | 25 | #XXXX: The role assumed by cdk is not a data lake administrator. 26 | # So, deploying PrincipalPermissions meets the error such as: 27 | # "Resource does not exist or requester is not authorized to access requested permissions." 28 | # In order to solve the error, it is necessary to promote the cdk execution role to the data lake administrator. 
29 | # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68 30 | cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings", 31 | admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty( 32 | data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn) 33 | )] 34 | ) 35 | 36 | for idx, database_name in enumerate(database_list): 37 | lf_permissions_on_database = aws_lakeformation.CfnPrincipalPermissions(self, f"LFPermissionsOnDatabase{idx}", 38 | permissions=["CREATE_TABLE", "DROP", "ALTER", "DESCRIBE"], 39 | permissions_with_grant_option=[], 40 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 41 | data_lake_principal_identifier=job_role.role_arn 42 | ), 43 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 44 | database=aws_lakeformation.CfnPrincipalPermissions.DatabaseResourceProperty( 45 | catalog_id=cdk.Aws.ACCOUNT_ID, 46 | name=database_name 47 | ) 48 | ) 49 | ) 50 | lf_permissions_on_database.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 51 | 52 | #XXX: In order to keep resource destruction order, 53 | # set dependency between CfnDataLakeSettings and CfnPrincipalPermissions 54 | lf_permissions_on_database.add_dependency(cfn_data_lake_settings) 55 | 56 | for idx, database_name in enumerate(database_list): 57 | lf_permissions_on_table = aws_lakeformation.CfnPrincipalPermissions(self, f"LFPermissionsOnTable{idx}", 58 | permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"], 59 | permissions_with_grant_option=[], 60 | principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty( 61 | data_lake_principal_identifier=job_role.role_arn 62 | ), 63 | resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty( 64 | #XXX: Can't specify a TableWithColumns resource and a Table resource 65 | table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty( 66 | catalog_id=cdk.Aws.ACCOUNT_ID, 67 | database_name=database_name, 68 | table_wildcard={} 69 | ) 70 | ) 71 | ) 72 | lf_permissions_on_table.apply_removal_policy(cdk.RemovalPolicy.DESTROY) 73 | lf_permissions_on_table.add_dependency(cfn_data_lake_settings) 74 | 75 | -------------------------------------------------------------------------------- /web-analytics-parquet/cdk_stacks/merge_small_files_lambda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from aws_cdk import ( 10 | Stack, 11 | aws_iam, 12 | aws_lambda, 13 | aws_logs, 14 | aws_events, 15 | aws_events_targets 16 | ) 17 | from constructs import Construct 18 | 19 | 20 | class MergeSmallFilesLambdaStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, s3_bucket_name, s3_folder_name, athena_work_group, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | _lambda_env = self.node.try_get_context('merge_small_files_lambda_env') 26 | 27 | LAMBDA_ENV_VARS = [ 28 | 'OLD_DATABASE', 29 | 'OLD_TABLE_NAME', 30 | 'NEW_DATABASE', 31 | 'NEW_TABLE_NAME', 32 | 'ATHENA_WORK_GROUP', 33 | 'OLD_TABLE_LOCATION_PREFIX', 34 | 'OUTPUT_PREFIX', 35 | 'STAGING_OUTPUT_PREFIX', 36 | 'COLUMN_NAMES' 37 | ] 38 | 39 | lambda_fn_env = {k: v for k, v in _lambda_env.items() if k in LAMBDA_ENV_VARS} 40 | additional_lambda_fn_env = { 41 | 
'ATHENA_WORK_GROUP': athena_work_group, 42 | 'OLD_TABLE_LOCATION_PREFIX': f"s3://{os.path.join(s3_bucket_name, s3_folder_name)}", 43 | 'OUTPUT_PREFIX': f"s3://{os.path.join(s3_bucket_name, _lambda_env['NEW_TABLE_S3_FOLDER_NAME'])}", 44 | 'STAGING_OUTPUT_PREFIX': f"s3://{os.path.join(s3_bucket_name, 'tmp')}", 45 | 'REGION_NAME': cdk.Aws.REGION 46 | } 47 | lambda_fn_env.update(additional_lambda_fn_env) 48 | 49 | self.s3_json_location, self.s3_parquet_location = (lambda_fn_env['OLD_TABLE_LOCATION_PREFIX'], lambda_fn_env['OUTPUT_PREFIX']) 50 | 51 | merge_small_files_lambda_fn = aws_lambda.Function(self, "MergeSmallFiles", 52 | runtime=aws_lambda.Runtime.PYTHON_3_11, 53 | function_name="MergeSmallFiles", 54 | handler="athena_ctas.lambda_handler", 55 | description="Merge small files in S3", 56 | code=aws_lambda.Code.from_asset('./src/main/python/MergeSmallFiles'), 57 | environment=lambda_fn_env, 58 | timeout=cdk.Duration.minutes(5) 59 | ) 60 | 61 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 62 | effect=aws_iam.Effect.ALLOW, 63 | resources=["*"], 64 | actions=["athena:*"])) 65 | 66 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 67 | effect=aws_iam.Effect.ALLOW, 68 | resources=["*"], 69 | actions=["s3:Get*", 70 | "s3:List*", 71 | "s3:AbortMultipartUpload", 72 | "s3:PutObject", 73 | ])) 74 | 75 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 76 | effect=aws_iam.Effect.ALLOW, 77 | resources=["*"], 78 | actions=["glue:CreateDatabase", 79 | "glue:DeleteDatabase", 80 | "glue:GetDatabase", 81 | "glue:GetDatabases", 82 | "glue:UpdateDatabase", 83 | "glue:CreateTable", 84 | "glue:DeleteTable", 85 | "glue:BatchDeleteTable", 86 | "glue:UpdateTable", 87 | "glue:GetTable", 88 | "glue:GetTables", 89 | "glue:BatchCreatePartition", 90 | "glue:CreatePartition", 91 | "glue:DeletePartition", 92 | "glue:BatchDeletePartition", 93 | "glue:UpdatePartition", 94 | "glue:GetPartition", 95 | "glue:GetPartitions", 96 | "glue:BatchGetPartition" 97 | ])) 98 | 99 | merge_small_files_lambda_fn.add_to_role_policy(aws_iam.PolicyStatement( 100 | effect=aws_iam.Effect.ALLOW, 101 | resources=["*"], 102 | actions=["lakeformation:GetDataAccess"])) 103 | 104 | lambda_fn_target = aws_events_targets.LambdaFunction(merge_small_files_lambda_fn) 105 | aws_events.Rule(self, "ScheduleRule", 106 | schedule=aws_events.Schedule.cron(minute="10"), 107 | targets=[lambda_fn_target] 108 | ) 109 | 110 | log_group = aws_logs.LogGroup(self, "MergeSmallFilesLogGroup", 111 | log_group_name=f"/aws/lambda/{self.stack_name}/MergeSmallFiles", 112 | removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: for testing 113 | retention=aws_logs.RetentionDays.THREE_DAYS) 114 | log_group.grant_write(merge_small_files_lambda_fn) 115 | 116 | self.lambda_exec_role = merge_small_files_lambda_fn.role 117 | 118 | 119 | cdk.CfnOutput(self, 'MergeFilesFuncName', 120 | value=merge_small_files_lambda_fn.function_name, 121 | export_name=f'{self.stack_name}-MergeFilesLambdaFuncName') 122 | cdk.CfnOutput(self, 'LambdaExecRoleArn', 123 | value=self.lambda_exec_role.role_arn, 124 | export_name=f'{self.stack_name}-LambdaExecRoleArn') 125 | -------------------------------------------------------------------------------- /web-analytics-parquet/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.24.41 2 | mimesis==18.0.0 3 | requests>=2.31.0 4 | 5 | # packages for Lambda Layer 6 | fastavro==1.10.0 7 | 
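# NOTE: fastavro is the package bundled into the "fastavro-lib" Lambda layer used by the
# SchemaValidator function; FirehoseDataTransformLambdaStack expects the packaged layer zip
# to be available at the S3 bucket/object configured under the 'firehose_data_tranform_lambda'
# context key (see cdk_stacks/firehose_dtata_transform_lambda.py).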
-------------------------------------------------------------------------------- /web-analytics-parquet/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.139.1 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /web-analytics-parquet/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /web-analytics-parquet/src/main/python/MergeSmallFiles/athena_ctas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import os 7 | import datetime 8 | import time 9 | import random 10 | 11 | import boto3 12 | 13 | random.seed(47) 14 | 15 | DRY_RUN = (os.getenv('DRY_RUN', 'false').lower() == 'true') 16 | AWS_REGION = os.getenv('REGION_NAME', 'us-east-1') 17 | 18 | OLD_DATABASE = os.getenv('OLD_DATABASE') 19 | OLD_TABLE_NAME = os.getenv('OLD_TABLE_NAME') 20 | NEW_DATABASE = os.getenv('NEW_DATABASE') 21 | NEW_TABLE_NAME = os.getenv('NEW_TABLE_NAME') 22 | WORK_GROUP = os.getenv('ATHENA_WORK_GROUP', 'primary') 23 | OLD_TABLE_LOCATION_PREFIX = os.getenv('OLD_TABLE_LOCATION_PREFIX') 24 | OUTPUT_PREFIX = os.getenv('OUTPUT_PREFIX') 25 | STAGING_OUTPUT_PREFIX = os.getenv('STAGING_OUTPUT_PREFIX') 26 | COLUMN_NAMES = os.getenv('COLUMN_NAMES', '*') 27 | 28 | EXTERNAL_LOCATION_FMT = '''{output_prefix}/year={year}/month={month:02}/day={day:02}/hour={hour:02}/''' 29 | 30 | CTAS_QUERY_FMT = '''CREATE TABLE {new_database}.tmp_{new_table_name} 31 | WITH ( 32 | external_location='{location}', 33 | format = 'PARQUET', 34 | parquet_compression = 'SNAPPY') 35 | AS SELECT {columns} 36 | FROM {old_database}.{old_table_name} 37 | WHERE year={year} AND month={month} AND day={day} AND hour={hour} 38 | WITH DATA 39 | ''' 40 | 41 | def run_alter_table_add_partition(athena_client, basic_dt, database_name, table_name, output_prefix): 42 | year, month, day, hour = (basic_dt.year, basic_dt.month, basic_dt.day, basic_dt.hour) 43 | 44 | tmp_table_name = '{table}_{year}{month:02}{day:02}{hour:02}'.format(table=table_name, 45 | year=year, month=month, day=day, hour=hour) 46 | 47 | output_location = '{}/alter_table_{}'.format(STAGING_OUTPUT_PREFIX, tmp_table_name) 48 | 49 | alter_table_stmt = '''ALTER TABLE {database}.{table_name} ADD if NOT EXISTS'''.format(database=database_name, 50 | table_name=table_name) 51 | 52 | partition_expr = '''PARTITION (year={year}, month={month}, day={day}, hour={hour}) LOCATION "{output_prefix}/year={year}/month={month:02}/day={day:02}/hour={hour:02}/"''' 53 | 54 | partition_expr_list = [] 55 | for i in (1, 0, -1): 56 | dt = basic_dt - datetime.timedelta(hours=i) 57 | year, month, day, hour = (dt.year, dt.month, dt.day, dt.hour) 58 | part_expr = partition_expr.format(year=year, month=month, day=day, hour=hour, 
output_prefix=output_prefix) 59 | partition_expr_list.append(part_expr) 60 | 61 | query = '{} {}'.format(alter_table_stmt, '\n'.join(partition_expr_list)) 62 | print('[INFO] QueryString:\n{}'.format(query), file=sys.stderr) 63 | print('[INFO] OutputLocation: {}'.format(output_location), file=sys.stderr) 64 | 65 | if DRY_RUN: 66 | print('[INFO] End of dry-run', file=sys.stderr) 67 | return 68 | 69 | response = athena_client.start_query_execution( 70 | QueryString=query, 71 | ResultConfiguration={ 72 | 'OutputLocation': output_location 73 | }, 74 | WorkGroup=WORK_GROUP 75 | ) 76 | print('[INFO] QueryExecutionId: {}'.format(response['QueryExecutionId']), file=sys.stderr) 77 | 78 | 79 | def run_drop_tmp_table(athena_client, basic_dt): 80 | year, month, day, hour = (basic_dt.year, basic_dt.month, basic_dt.day, basic_dt.hour) 81 | 82 | tmp_table_name = '{table}_{year}{month:02}{day:02}{hour:02}'.format(table=NEW_TABLE_NAME, 83 | year=year, month=month, day=day, hour=hour) 84 | 85 | output_location = '{}/tmp_{}'.format(STAGING_OUTPUT_PREFIX, tmp_table_name) 86 | query = 'DROP TABLE IF EXISTS {database}.tmp_{table_name}'.format(database=NEW_DATABASE, 87 | table_name=tmp_table_name) 88 | 89 | print('[INFO] QueryString:\n{}'.format(query), file=sys.stderr) 90 | print('[INFO] OutputLocation: {}'.format(output_location), file=sys.stderr) 91 | 92 | if DRY_RUN: 93 | print('[INFO] End of dry-run', file=sys.stderr) 94 | return 95 | 96 | response = athena_client.start_query_execution( 97 | QueryString=query, 98 | ResultConfiguration={ 99 | 'OutputLocation': output_location 100 | }, 101 | WorkGroup=WORK_GROUP 102 | ) 103 | print('[INFO] QueryExecutionId: {}'.format(response['QueryExecutionId']), file=sys.stderr) 104 | 105 | 106 | def run_ctas(athena_client, basic_dt): 107 | year, month, day, hour = (basic_dt.year, basic_dt.month, basic_dt.day, basic_dt.hour) 108 | 109 | new_table_name = '{table}_{year}{month:02}{day:02}{hour:02}'.format(table=NEW_TABLE_NAME, 110 | year=year, month=month, day=day, hour=hour) 111 | 112 | output_location = '{}/tmp_{}'.format(STAGING_OUTPUT_PREFIX, new_table_name) 113 | external_location = EXTERNAL_LOCATION_FMT.format(output_prefix=OUTPUT_PREFIX, 114 | year=year, month=month, day=day, hour=hour) 115 | 116 | query = CTAS_QUERY_FMT.format(new_database=NEW_DATABASE, new_table_name=new_table_name, 117 | old_database=OLD_DATABASE, old_table_name=OLD_TABLE_NAME, columns=COLUMN_NAMES, 118 | year=year, month=month, day=day, hour=hour, location=external_location) 119 | 120 | print('[INFO] QueryString:\n{}'.format(query), file=sys.stderr) 121 | print('[INFO] ExternalLocation: {}'.format(external_location), file=sys.stderr) 122 | print('[INFO] OutputLocation: {}'.format(output_location), file=sys.stderr) 123 | 124 | if DRY_RUN: 125 | print('[INFO] End of dry-run', file=sys.stderr) 126 | return 127 | 128 | response = athena_client.start_query_execution( 129 | QueryString=query, 130 | QueryExecutionContext={ 131 | 'Database': NEW_DATABASE 132 | }, 133 | ResultConfiguration={ 134 | 'OutputLocation': output_location 135 | }, 136 | WorkGroup=WORK_GROUP 137 | ) 138 | print('[INFO] QueryExecutionId: {}'.format(response['QueryExecutionId']), file=sys.stderr) 139 | 140 | 141 | def lambda_handler(event, context): 142 | event_dt = datetime.datetime.strptime(event['time'], "%Y-%m-%dT%H:%M:%SZ") 143 | prev_basic_dt, basic_dt = [event_dt - datetime.timedelta(hours=e) for e in (2, 1)] 144 | 145 | client = boto3.client('athena', region_name=AWS_REGION) 146 | run_drop_tmp_table(client, prev_basic_dt) 147 
| 148 | if not DRY_RUN: 149 | print('[INFO] Wait for a few seconds until dropping old table', file=sys.stderr) 150 | time.sleep(10) 151 | 152 | run_alter_table_add_partition(client, basic_dt, 153 | database_name=OLD_DATABASE, 154 | table_name=OLD_TABLE_NAME, 155 | output_prefix=OLD_TABLE_LOCATION_PREFIX) 156 | 157 | if not DRY_RUN: 158 | print('[INFO] Wait for a few seconds until adding partitions to table: %s.%s' % (OLD_DATABASE, OLD_TABLE_NAME), file=sys.stderr) 159 | time.sleep(10) 160 | 161 | run_alter_table_add_partition(client, basic_dt, 162 | database_name=NEW_DATABASE, 163 | table_name=NEW_TABLE_NAME, 164 | output_prefix=OUTPUT_PREFIX) 165 | 166 | if not DRY_RUN: 167 | print('[INFO] Wait for a few seconds until adding partitions to table: %s.%s' % (NEW_DATABASE, NEW_TABLE_NAME), file=sys.stderr) 168 | time.sleep(10) 169 | 170 | run_ctas(client, basic_dt) 171 | 172 | 173 | if __name__ == '__main__': 174 | import argparse 175 | 176 | parser = argparse.ArgumentParser() 177 | parser.add_argument('-dt', '--basic-datetime', default=datetime.datetime.today().strftime('%Y-%m-%dT%H:05:00Z'), 178 | help='The scheduled event occurrence time ex) 2020-02-28T03:05:00Z') 179 | parser.add_argument('--region-name', default='us-east-1', 180 | help='aws region name') 181 | parser.add_argument('--old-database', default='mydatabase', 182 | help='aws athena source database name used by ctas query') 183 | parser.add_argument('--old-table-name', default='web_log_json', 184 | help='aws athena source table name used by ctas query') 185 | parser.add_argument('--new-database', default='mydatabase', 186 | help='aws athena target database name for merged files') 187 | parser.add_argument('--new-table-name', default='ctas_web_log_parquet', 188 | help='aws athena target table name for merged files') 189 | parser.add_argument('--work-group', default='primary', 190 | help='aws athena work group') 191 | parser.add_argument('--old-table-location-prefix', required=True, 192 | help='s3 path for aws athena source table') 193 | parser.add_argument('--output-prefix', required=True, 194 | help='s3 path for aws athena target table') 195 | parser.add_argument('--staging-output-prefix', required=True, 196 | help='s3 path for aws athena tmp table') 197 | parser.add_argument('--column-names', default='*', 198 | help='selectable column names of aws athena source table') 199 | parser.add_argument('--run', action='store_true', 200 | help='run ctas query') 201 | 202 | options = parser.parse_args() 203 | 204 | DRY_RUN = False if options.run else True 205 | AWS_REGION = options.region_name 206 | OLD_DATABASE = options.old_database 207 | OLD_TABLE_NAME= options.old_table_name 208 | NEW_DATABASE = options.new_database 209 | NEW_TABLE_NAME = options.new_table_name 210 | WORK_GROUP = options.work_group 211 | OLD_TABLE_LOCATION_PREFIX = options.old_table_location_prefix 212 | OUTPUT_PREFIX = options.output_prefix 213 | STAGING_OUTPUT_PREFIX = options.staging_output_prefix 214 | COLUMN_NAMES = options.column_names 215 | 216 | event = { 217 | "id": "cdc73f9d-aea9-11e3-9d5a-835b769c0d9c", 218 | "detail-type": "Scheduled Event", 219 | "source": "aws.events", 220 | "account": "123456789012", 221 | "time": options.basic_datetime, # ex) "2020-02-28T03:05:00Z" 222 | "region": AWS_REGION, # ex) "us-east-1", 223 | "resources": [ 224 | f"arn:aws:events:{AWS_REGION}:123456789012:rule/ExampleRule" 225 | ], 226 | "detail": {} 227 | } 228 | print('[DEBUG] event:\n{}'.format(event), file=sys.stderr) 229 | lambda_handler(event, {}) 230 | 
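# NOTE: a minimal local dry-run sketch for this CTAS merge script. The database/table names
# below are just the argparse defaults above; the S3 prefixes are placeholders, not values
# taken from this repository:
#
#   python3 athena_ctas.py \
#     --region-name us-east-1 \
#     --old-database mydatabase --old-table-name web_log_json \
#     --new-database mydatabase --new-table-name ctas_web_log_parquet \
#     --work-group primary \
#     --old-table-location-prefix s3://<your-bucket>/json-data \
#     --output-prefix s3://<your-bucket>/parquet-data \
#     --staging-output-prefix s3://<your-bucket>/tmp
#
# Without --run the script only prints the ALTER TABLE and CTAS statements it would submit
# to Athena (DRY_RUN stays True); add --run to actually start the query executions.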
-------------------------------------------------------------------------------- /web-analytics-parquet/src/main/python/SchemaValidator/schema_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | #vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import base64 6 | import json 7 | import logging 8 | import collections 9 | from datetime import datetime 10 | 11 | import fastavro 12 | 13 | LOGGER = logging.getLogger() 14 | if len(LOGGER.handlers) > 0: 15 | # The Lambda environment pre-configures a handler logging to stderr. 16 | # If a handler is already configured, `.basicConfig` does not execute. 17 | # Thus we set the level directly. 18 | LOGGER.setLevel(logging.INFO) 19 | else: 20 | logging.basicConfig(level=logging.INFO) 21 | 22 | 23 | ORIGINAL_SCHEMA = { 24 | 'name': 'WebLogs', 25 | 'type': 'record', 26 | 'fields': [ 27 | { 28 | 'name': 'userId', 29 | 'type': 'string' 30 | }, 31 | { 32 | 'name': 'sessionId', 33 | 'type': 'string' 34 | }, 35 | { 36 | 'name': 'referrer', 37 | 'type': ['string', 'null'] 38 | }, 39 | { 40 | 'name': 'userAgent', 41 | 'type': ['string', 'null'] 42 | }, 43 | { 44 | 'name': 'ip', 45 | 'type': 'string' 46 | }, 47 | { 48 | 'name': 'hostname', 49 | 'type': 'string' 50 | }, 51 | { 52 | 'name': 'os', 53 | 'type': ['string', 'null'] 54 | }, 55 | { 56 | 'name': 'timestamp', 57 | 'type': { 58 | 'type': 'string', 59 | 'logicalType': 'datetime' 60 | } 61 | }, 62 | { 63 | 'name': 'uri', 64 | 'type': 'string' 65 | } 66 | ] 67 | } 68 | 69 | 70 | def read_datetime(data, writer_schema=None, reader_schema=None): 71 | return datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 72 | 73 | def prepare_datetime(data, schema): 74 | """Converts datetime.datetime to string representing the date and time""" 75 | if isinstance(data, datetime): 76 | return datetime.strftime('%Y-%m-%dT%H:%M:%SZ') 77 | else: 78 | try: 79 | dt = datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ') 80 | return dt.strftime('%Y-%m-%dT%H:%M:%SZ') 81 | except Exception as ex: 82 | return None 83 | 84 | fastavro.read.LOGICAL_READERS["string-datetime"] = read_datetime 85 | fastavro.write.LOGICAL_WRITERS["string-datetime"] = prepare_datetime 86 | 87 | PARSED_SCHEMA = fastavro.parse_schema(ORIGINAL_SCHEMA) 88 | 89 | def check_schema(record): 90 | try: 91 | return fastavro.validation.validate(record, PARSED_SCHEMA, raise_errors=False) 92 | except Exception as ex: 93 | LOGGER.error(ex) 94 | return False 95 | 96 | # Signature for all Lambda functions that user must implement 97 | def lambda_handler(firehose_records_input, context): 98 | LOGGER.debug("Received records for processing from DeliveryStream: {deliveryStreamArn}, Region: {region}, and InvocationId: {invocationId}".format( 99 | deliveryStreamArn=firehose_records_input['deliveryStreamArn'], 100 | region=firehose_records_input['region'], 101 | invocationId=firehose_records_input['invocationId'])) 102 | 103 | # Create return value. 104 | firehose_records_output = {'records': []} 105 | 106 | counter = collections.Counter(total=0, valid=0, invalid=0) 107 | 108 | # Create result object. 
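# Each Firehose record below is base64-decoded, parsed as JSON, and validated against
# PARSED_SCHEMA; valid records are re-emitted as JSON Lines with result 'Ok', while invalid
# ones are marked 'ProcessingFailed' so Firehose writes them under the configured error
# output prefix in S3.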
109 | # Go through records and process them 110 | for firehose_record_input in firehose_records_input['records']: 111 | counter['total'] += 1 112 | 113 | # Get user payload 114 | payload = base64.b64decode(firehose_record_input['data']) 115 | json_value = json.loads(payload) 116 | 117 | LOGGER.debug("Record that was received: {}".format(json_value)) 118 | 119 | #TODO: check if schema is valid 120 | is_valid = check_schema(json_value) 121 | counter['valid' if is_valid else 'invalid'] += 1 122 | 123 | # Create output Firehose record and add modified payload and record ID to it. 124 | firehose_record_output = { 125 | 'recordId': firehose_record_input['recordId'], 126 | #XXX: convert JSON to JSONLine 127 | 'data': base64.b64encode(payload.rstrip(b'\n') + b'\n'), 128 | 129 | # The status of the data transformation of the record. 130 | # The possible values are: 131 | # Ok (the record was transformed successfully), 132 | # Dropped (the record was dropped intentionally by your processing logic), 133 | # and ProcessingFailed (the record could not be transformed). 134 | # If a record has a status of Ok or Dropped, Kinesis Data Firehose considers it successfully processed. 135 | # Otherwise, Kinesis Data Firehose considers it unsuccessfully processed. 136 | 137 | # 'ProcessFailed' record will be put into error bucket in S3 138 | 'result': 'Ok' if is_valid else 'ProcessingFailed' # [Ok, Dropped, ProcessingFailed] 139 | } 140 | 141 | # Must set proper record ID 142 | # Add the record to the list of output records. 143 | firehose_records_output['records'].append(firehose_record_output) 144 | 145 | LOGGER.info(', '.join("{}={}".format(k, v) for k, v in counter.items())) 146 | 147 | # At the end return processed records 148 | return firehose_records_output 149 | 150 | 151 | if __name__ == '__main__': 152 | import pprint 153 | 154 | record_list = [ 155 | { 156 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 157 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 158 | "referrer": "brandon.biz", 159 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 160 | "ip": "202.165.71.49", 161 | "hostname": "toxic.tokyo", 162 | "os": "openSUSE", 163 | "timestamp": "2022-09-16T07:35:46Z", 164 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 165 | }, 166 | { 167 | "userId": "70b1f606-aa63-47fb-bc92-76de9c59d064", 168 | "sessionId": "928e78473db8449b17644b2c", 169 | # missing optional data 170 | # "referrer": "toe.gq", 171 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.53", 172 | "ip": "12.166.113.176", 173 | "hostname": "drivers.glass", 174 | "os": "Windows 8.1", 175 | "timestamp": "2022-09-16T07:52:47Z", 176 | "uri": "https://aaa.gov/2022/04/29/cialis-prayer-presentations-completed-avenue-vision?trucks=cut&indeed=members" 177 | }, 178 | { 179 | "userId": "897bef5f-294d-4ecc-a3b6-ef2844958720", 180 | "sessionId": "a5aa20a72c9e37588f9bbeaa", 181 | "referrer": "brandon.biz", 182 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 8.52", 183 | "ip": "202.165.71.49", 184 | "hostname": "toxic.tokyo", 185 | "os": "openSUSE", 186 | # invalid datetime format 187 | "timestamp": "2022-09-16 07:35:46", 188 | "uri": "https://phones.madrid/2012/02/12/bed-federal-in-wireless-scientists-shoes-walker-those-premier-younger?lane=outcomes&acc=memories" 189 | }, 190 | { 191 | # missing required data 192 | # "userId": "045e63c7-b276-4117-9706-7c2e3b87d5f5", 193 | 
"sessionId": "abfd47eb7dd7b8aeec0555a7", 194 | "referrer": "transfer.edu", 195 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; de) Opera 9.50", 196 | "ip": "170.128.148.234", 197 | "hostname": "propecia.tc", 198 | "os": "Lubuntu", 199 | "timestamp": "2022-09-16T07:46:04Z", 200 | "uri": "https://pee.cloud/2019/06/15/alan-publish-perl-snow-notification-gap-improvement-guaranteed-changed-determining?casino=admissions&cottage=hotel" 201 | }, 202 | { 203 | "userId": "e504cd9d-30da-497f-8f28-2b3f64220e16", 204 | "sessionId": "fd4807ab825ee8bd950b1e8b", 205 | "referrer": "liquid.aquitaine", 206 | "userAgent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.02", 207 | # mismatched data type 208 | "ip": 212234672, 209 | "hostname": "consequently.com", 210 | "os": "Gentoo", 211 | "timestamp": "2022-09-16T07:13:29Z", 212 | "uri": "https://railway.sz/2014/10/30/use-phone-task-marketplace?pot=it&album=cook" 213 | } 214 | ] 215 | 216 | for record in record_list: 217 | event = { 218 | "invocationId": "invocationIdExample", 219 | "deliveryStreamArn": "arn:aws:kinesis:EXAMPLE", 220 | "region": "us-east-1", 221 | "records": [ 222 | { 223 | "recordId": "49546986683135544286507457936321625675700192471156785154", 224 | "approximateArrivalTimestamp": 1495072949453, 225 | "data": base64.b64encode(json.dumps(record).encode('utf-8')) 226 | } 227 | ] 228 | } 229 | 230 | res = lambda_handler(event, {}) 231 | for elem in res['records']: 232 | print(f"[{elem['result']}]") 233 | print(base64.b64decode(elem['data']).decode('utf-8')) 234 | 235 | -------------------------------------------------------------------------------- /web-analytics-parquet/src/utils/gen_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import sys 6 | import argparse 7 | from datetime import ( 8 | datetime, 9 | timezone 10 | ) 11 | import json 12 | import time 13 | import typing 14 | 15 | from mimesis.locales import Locale 16 | from mimesis.schema import Field, Schema 17 | from mimesis.providers.base import BaseProvider 18 | import requests 19 | 20 | 21 | class CustomDatetime(BaseProvider): 22 | class Meta: 23 | """Class for metadata.""" 24 | name: typing.Final[str] = "custom_datetime" 25 | 26 | def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: 27 | super().__init__(*args, **kwargs) 28 | 29 | def timestamp(self) -> str: 30 | utc_now = datetime.now(timezone.utc) 31 | minute = self.random.randint(0, 59) 32 | second = self.random.randint(0, 59) 33 | random_datetime = utc_now.replace(minute=minute, second=second) 34 | return random_datetime.strftime("%Y-%m-%dT%H:%M:%SZ") 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | 40 | parser.add_argument('--api-url', help='log collector api url') 41 | parser.add_argument('--api-method', default='records', choices=['record', 'records'], 42 | help='log collector api method [record | records]') 43 | parser.add_argument('--stream-name', help='kinesis stream name') 44 | parser.add_argument('--max-count', default=15, type=int, help='max number of records to put') 45 | parser.add_argument('--dry-run', action='store_true') 46 | 47 | options = parser.parse_args() 48 | 49 | _field = Field(locale=Locale.EN) 50 | _field._generic.add_provider(CustomDatetime) 51 | 52 | schema_definition = lambda: { 53 | "userId": _field("uuid"), 54 | "sessionId": _field("token_hex", entropy=12), 55 | 
"referrer": _field("internet.hostname"), 56 | "userAgent": _field("internet.user_agent"), 57 | "ip": _field("internet.ip_v4"), 58 | "hostname": _field("internet.hostname"), 59 | "os": _field("development.os"), 60 | "timestamp": _field("custom_datetime.timestamp"), 61 | "uri": _field("internet.uri", query_params_count=2) 62 | } 63 | schema = Schema(schema=schema_definition, iterations=options.max_count) 64 | 65 | log_collector_url = f'{options.api_url}/streams/{options.stream_name}/{options.api_method}' if not options.dry_run else None 66 | 67 | for record in schema: 68 | if options.dry_run: 69 | print(json.dumps(record), file=sys.stderr) 70 | continue 71 | 72 | partition_key = record['userId'] 73 | if options.api_method == 'record': 74 | data = {'Data': record, 'PartitionKey': partition_key} 75 | payload = f'{json.dumps(data)}' 76 | else: 77 | #XXX: make sure data has newline 78 | data = {"records":[{'data': f'{json.dumps(record)}\n', 'partition-key': partition_key}]} 79 | payload = json.dumps(data) 80 | 81 | res = requests.put(log_collector_url, data=payload, headers={'Content-Type': 'application/json'}) 82 | if res.status_code == 200: 83 | print(f'[{res.status_code} {res.reason}]', res.text, file=sys.stderr) 84 | else: 85 | print(f'[{res.status_code} {res.reason}]', file=sys.stderr) 86 | sys.exit(1) 87 | time.sleep(0.5) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /web-analytics-parquet/src/utils/kds_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import argparse 6 | import pprint 7 | import random 8 | import time 9 | 10 | import boto3 11 | 12 | random.seed(47) 13 | 14 | SHARD_ITER_TYPE = ('TRIM_HORIZON', 'LATEST') 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--stream-name', action="store", help='kinesis stream name') 20 | parser.add_argument('--shard-id', action="store", help='kinesis stream shard-id') 21 | parser.add_argument('--iter-type', choices=SHARD_ITER_TYPE, default='LATEST', 22 | help='kinesis stream shard iterator type: [{}]'.format(', '.join(SHARD_ITER_TYPE))) 23 | parser.add_argument('--region-name', action='store', default='us-east-1', 24 | help='aws region name (default: us-east-1)') 25 | 26 | options = parser.parse_args() 27 | 28 | stream_name, shard_iter_type = options.stream_name, options.iter_type 29 | 30 | kinesis_client = boto3.client('kinesis', region_name=options.region_name) 31 | response = kinesis_client.describe_stream(StreamName=stream_name) 32 | if options.shard_id: 33 | shard_id = options.shard_id 34 | else: 35 | shard_id_list = [e['ShardId'] for e in response['StreamDescription']['Shards']] 36 | shard_id = random.choice(shard_id_list) 37 | 38 | shard_iterator = kinesis_client.get_shard_iterator(StreamName=stream_name, 39 | ShardId=shard_id, 40 | ShardIteratorType=shard_iter_type) 41 | 42 | shard_iter = shard_iterator['ShardIterator'] 43 | record_response = kinesis_client.get_records(ShardIterator=shard_iter, Limit=123) 44 | pprint.pprint(record_response.get('Records', []), indent=2) 45 | 46 | while 'NextShardIterator' in record_response: 47 | record_response = kinesis_client.get_records(ShardIterator=record_response['NextShardIterator'], Limit=123) 48 | pprint.pprint(record_response.get('Records', []), indent=2) 49 | 50 | # wait for a few seconds 51 | 
time.sleep(5) 52 | 53 | if __name__ == '__main__': 54 | main() 55 | 56 | --------------------------------------------------------------------------------
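Example usage of the utility scripts above (illustrative only; the API endpoint, stream name,
and region are placeholders, not values taken from this repository):

  # send fake web-log events to the log collector API in front of the Kinesis data stream
  python3 src/utils/gen_fake_data.py \
    --api-url https://<api-id>.execute-api.<region>.amazonaws.com/<stage> \
    --api-method records \
    --stream-name <your-kinesis-stream-name> \
    --max-count 15
  # (add --dry-run to print the generated records to stderr without sending them)

  # read the same events back from the Kinesis data stream
  python3 src/utils/kds_consumer.py \
    --stream-name <your-kinesis-stream-name> \
    --iter-type TRIM_HORIZON \
    --region-name us-east-1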