├── LICENSE ├── README.md ├── mkzip.sh ├── src ├── enhanced_lambda_metrics.py ├── fingerprint.py ├── lambda_function.py └── main.py └── tests └── test_fingerprint.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Datadog, Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | SPDX-License-Identifier: Apache-2.0 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mysql-slow-query-datadog-lambda 2 | 3 | AWS Lambda function to relay fingerprinted / normalized MySQL slow query logs to Datadog. 4 | 5 | This function normalizes SQL statements as shown below so that metrics can be aggregated on Datadog. 6 | 7 | ``` 8 | SELECT id, name FROM tbl WHERE id = "1000" => SELECT id, name FROM tbl WHERE id = ? 9 | SELECT id, name FROM tbl WHERE id IN (10, 20, 30) => SELECT id, name FROM tbl WHERE id IN (?+) 10 | ``` 11 | 12 | # How to use 13 | 14 | ## Export slow query logs to CloudWatch Logs 15 | 1. Enable the slow_query_log parameter on your RDS database instance 16 | 17 | 2. Modify your database instance to export slow query logs to CloudWatch Logs 18 | 19 | ## Create a Datadog API Key secret on Secrets Manager 20 | 1. Create a Datadog API Key secret in **PLAIN TEXT** format 21 | 22 | You can find your API Key here: 23 | https://app.datadoghq.com/account/settings#api 24 | 25 | ## Create the Lambda function 26 | 1. Download function.zip from GitHub 27 | 28 | https://github.com/samitani/mysql-slow-query-datadog-lambda/releases 29 | 30 | 2. Create a Lambda function with the downloaded function.zip 31 | 32 | Specify Python 3 as the Runtime and `main.lambda_handler` as the Handler 33 | 34 | 3. Configure the Lambda environment variables below 35 | 36 | | Key | Value | 37 | |:----------------------|:--------------| 38 | | DD_API_KEY_SECRET_ARN | AWS Secrets Manager ARN of the Datadog API Key.
e.g. `arn:aws:secretsmanager:ap-northeast-1:XXXXXXXXX:secret:DdApiKeySecret-XXXXXXXX` | 39 | | DD_ENHANCED_METRICS | false | 40 | | DD_SITE | datadoghq.com | 41 | 42 | 4. Edit the IAM role to allow this Lambda function to read the secret 43 | 44 | 5. Create a Lambda subscription filter on your slow query log CloudWatch Log groups 46 | ## Datadog 47 | Generate metrics with the Grok parser below. 48 | 49 | ``` 50 | SlowLogRule ^(\# Time: (%{date("yyMMdd H:mm:ss"):date}|%{date("yyMMdd HH:mm:ss"):date})\n+)?\# User@Host: %{notSpace: user1}\[%{notSpace: user2}\] @ (%{notSpace: host}| ) *\[%{regex("[0-9.]*"): ip}\] Id:[\x20\t]+%{number: id}\n+\# Query_time: %{number: query_time} *Lock_time: %{number: lock_time} *Rows_sent: %{number: rows_sent} *Rows_examined: %{number: rows_examined}\n(SET timestamp=%{number: timestamp};\n+)?%{regex("[a-zA-Z].*"):query}. 51 | ``` 52 | 53 | ## Example 54 | ![image](https://user-images.githubusercontent.com/2655102/80804977-b6e40f00-8bf1-11ea-9529-485646d079c3.png) 55 | ![image](https://user-images.githubusercontent.com/2655102/80805055-ea269e00-8bf1-11ea-9c24-6f13d2314cf1.png) 56 | 57 | ## Note 58 | `enhanced_lambda_metrics.py` and `lambda_function.py` were borrowed from the Datadog repository below. 59 | 60 | https://github.com/DataDog/datadog-serverless-functions 61 | -------------------------------------------------------------------------------- /mkzip.sh: -------------------------------------------------------------------------------- 1 | pushd src 2 | pip3 install requests -t . 3 | zip -r ../function.zip * 4 | popd 5 | -------------------------------------------------------------------------------- /src/enhanced_lambda_metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | from collections import defaultdict 6 | from time import time 7 | 8 | import boto3 9 | from botocore.exceptions import ClientError 10 | 11 | ENHANCED_METRICS_NAMESPACE_PREFIX = "aws.lambda.enhanced" 12 | 13 | TAGS_CACHE_TTL_SECONDS = 3600 14 | 15 | # Latest Lambda pricing per https://aws.amazon.com/lambda/pricing/ 16 | BASE_LAMBDA_INVOCATION_PRICE = 0.0000002 17 | LAMBDA_PRICE_PER_GB_SECOND = 0.0000166667 18 | 19 | ESTIMATED_COST_METRIC_NAME = "estimated_cost" 20 | 21 | GET_RESOURCES_LAMBDA_FILTER = "lambda" 22 | 23 | 24 | # Names to use for metrics and for the named regex groups 25 | REQUEST_ID_FIELD_NAME = "request_id" 26 | DURATION_METRIC_NAME = "duration" 27 | BILLED_DURATION_METRIC_NAME = "billed_duration" 28 | MEMORY_ALLOCATED_FIELD_NAME = "memorysize" 29 | MAX_MEMORY_USED_METRIC_NAME = "max_memory_used" 30 | 31 | # Create named groups for each metric and tag so that we can 32 | # access the values from the search result by name 33 | REPORT_LOG_REGEX = re.compile( 34 | r"REPORT\s+" 35 | + r"RequestId:\s+(?P<{}>[\w-]+)\s+".format(REQUEST_ID_FIELD_NAME) 36 | + r"Duration:\s+(?P<{}>[\d\.]+)\s+ms\s+".format(DURATION_METRIC_NAME) 37 | + r"Billed\s+Duration:\s+(?P<{}>[\d\.]+)\s+ms\s+".format( 38 | BILLED_DURATION_METRIC_NAME 39 | ) 40 | + r"Memory\s+Size:\s+(?P<{}>\d+)\s+MB\s+".format(MEMORY_ALLOCATED_FIELD_NAME) 41 | + r"Max\s+Memory\s+Used:\s+(?P<{}>\d+)\s+MB".format(MAX_MEMORY_USED_METRIC_NAME) 42 | ) 43 | 44 | METRICS_TO_PARSE_FROM_REPORT = [ 45 | DURATION_METRIC_NAME, 46 | BILLED_DURATION_METRIC_NAME, 47 | MAX_MEMORY_USED_METRIC_NAME, 48 | ] 49 | 50 | # Multiply the duration metrics by 1/1000 to convert ms to seconds 51 | METRIC_ADJUSTMENT_FACTORS = { 52 | DURATION_METRIC_NAME: 0.001, 53 |
BILLED_DURATION_METRIC_NAME: 0.001, 54 | } 55 | 56 | 57 | resource_tagging_client = boto3.client("resourcegroupstaggingapi") 58 | 59 | log = logging.getLogger() 60 | 61 | 62 | try: 63 | from datadog_lambda.metric import lambda_stats 64 | 65 | DD_SUBMIT_ENHANCED_METRICS = True 66 | except ImportError: 67 | log.debug( 68 | "Could not import from the Datadog Lambda layer so enhanced metrics won't be submitted. " 69 | "Add the Datadog Lambda layer to this function to submit enhanced metrics." 70 | ) 71 | DD_SUBMIT_ENHANCED_METRICS = False 72 | 73 | 74 | class LambdaTagsCache(object): 75 | def __init__(self, tags_ttl_seconds=TAGS_CACHE_TTL_SECONDS): 76 | self.tags_ttl_seconds = tags_ttl_seconds 77 | 78 | self.tags_by_arn = {} 79 | self.missing_arns = set() 80 | self.last_tags_fetch_time = 0 81 | 82 | def _refresh(self): 83 | """Populate the tags in the cache by making calls to GetResources 84 | """ 85 | self.last_tags_fetch_time = time() 86 | 87 | # If the custom tag fetch env var is not set to true do not fetch 88 | if not should_fetch_custom_tags(): 89 | log.debug( 90 | "Not fetching custom tags because the env variable DD_FETCH_LAMBDA_TAGS is not set to true" 91 | ) 92 | return 93 | 94 | self.tags_by_arn = build_tags_by_arn_cache() 95 | self.missing_arns -= set(self.tags_by_arn.keys()) 96 | 97 | def _is_expired(self): 98 | """Returns bool for whether the tag fetch TTL has expired 99 | """ 100 | earliest_time_to_refetch_tags = ( 101 | self.last_tags_fetch_time + self.tags_ttl_seconds 102 | ) 103 | return time() > earliest_time_to_refetch_tags 104 | 105 | def _should_refresh_if_missing_arn(self, resource_arn): 106 | """ Determines whether to try and fetch a missing lambda arn. 107 | We only refresh if we encounter an arn that we haven't seen 108 | since the last refresh. This prevents refreshing on every call when 109 | tags can't be found for an arn. 
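Returns True only for ARNs that are neither in the missing-arn set nor already present in the tag cache.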
110 | """ 111 | if resource_arn in self.missing_arns: 112 | return False 113 | return self.tags_by_arn.get(resource_arn) is None 114 | 115 | def get(self, resource_arn): 116 | """Get the tags for the Lambda function from the cache 117 | 118 | Will refetch the tags if they are out of date, or a lambda arn is encountered 119 | which isn't in the tag list 120 | 121 | Returns: 122 | lambda_tags (str[]): the list of "key:value" Datadog tag strings 123 | """ 124 | if self._is_expired() or self._should_refresh_if_missing_arn(resource_arn): 125 | self._refresh() 126 | 127 | function_tags = self.tags_by_arn.get(resource_arn, None) 128 | 129 | if function_tags is None: 130 | self.missing_arns.add(resource_arn) 131 | return [] 132 | 133 | return function_tags 134 | 135 | 136 | # Store the cache in the global scope so that it will be reused as long as 137 | # the log forwarder Lambda container is running 138 | account_lambda_tags_cache = LambdaTagsCache() 139 | 140 | 141 | class DatadogMetricPoint(object): 142 | """Holds a datapoint's data so that it can be prepared for submission to DD 143 | 144 | Properties: 145 | name (str): metric name, with namespace 146 | value (int | float): the datapoint's value 147 | 148 | """ 149 | 150 | def __init__(self, name, value, timestamp=None, tags=[]): 151 | self.name = name 152 | self.value = value 153 | self.tags = tags 154 | self.timestamp = timestamp 155 | 156 | def add_tags(self, tags): 157 | """Add tags to this metric 158 | 159 | Args: 160 | tags (str[]): list of tags to add to this metric 161 | """ 162 | self.tags = self.tags + tags 163 | 164 | def set_timestamp(self, timestamp): 165 | """Set the metric's timestamp 166 | 167 | Args: 168 | timestamp (int): Unix timestamp of this metric 169 | """ 170 | self.timestamp = timestamp 171 | 172 | def submit_to_dd(self): 173 | """Submit this metric to the Datadog API 174 | """ 175 | timestamp = self.timestamp 176 | if not timestamp: 177 | timestamp = time() 178 | 179 | log.debug("Submitting metric {} {} {}".format(self.name, self.value, self.tags)) 180 | lambda_stats.distribution( 181 | self.name, self.value, timestamp=timestamp, tags=self.tags 182 | ) 183 | 184 | 185 | def should_fetch_custom_tags(): 186 | """Checks the env var to determine if the customer has opted-in to fetching custom tags 187 | """ 188 | return os.environ.get("DD_FETCH_LAMBDA_TAGS", "false").lower() == "true" 189 | 190 | 191 | _other_chars = r"\w:\-\.\/" 192 | Sanitize = re.compile(r"[^%s]" % _other_chars, re.UNICODE).sub 193 | Dedupe = re.compile(r"_+", re.UNICODE).sub 194 | FixInit = re.compile(r"^[_\d]*", re.UNICODE).sub 195 | 196 | 197 | def sanitize_aws_tag_string(tag, remove_colons=False): 198 | """Convert characters banned from DD but allowed in AWS tags to underscores 199 | """ 200 | global Sanitize, Dedupe, FixInit 201 | 202 | # 1. Replaces colons with _ 203 | # 2. Convert to all lowercase unicode string 204 | # 3. Convert bad characters to underscores 205 | # 4. Dedupe contiguous underscores 206 | # 5. Remove initial underscores/digits such that the string 207 | # starts with an alpha char 208 | # FIXME: tag normalization incorrectly supports tags starting 209 | # with a ':', but this behavior should be phased out in future 210 | # as it results in unqueryable data. See dogweb/#11193 211 | # 6. Truncate to 200 characters 212 | # 7. 
Strip trailing underscores 213 | 214 | if len(tag) == 0: 215 | # if tag is empty, nothing to do 216 | return tag 217 | 218 | if remove_colons: 219 | tag = tag.replace(":", "_") 220 | tag = Dedupe(u"_", Sanitize(u"_", tag.lower())) 221 | first_char = tag[0] 222 | if first_char == u"_" or u"0" <= first_char <= "9": 223 | tag = FixInit(u"", tag) 224 | tag = tag[0:200].rstrip("_") 225 | return tag 226 | 227 | 228 | def get_dd_tag_string_from_aws_dict(aws_key_value_tag_dict): 229 | """Converts the AWS dict tag format to the dd key:value string format 230 | 231 | Args: 232 | aws_key_value_tag_dict (dict): the dict the GetResources endpoint returns for a tag 233 | ex: { "Key": "creator", "Value": "swf"} 234 | 235 | Returns: 236 | key:value colon-separated string built from the dict 237 | ex: "creator:swf" 238 | """ 239 | key = sanitize_aws_tag_string(aws_key_value_tag_dict["Key"], remove_colons=True) 240 | value = sanitize_aws_tag_string(aws_key_value_tag_dict.get("Value")) 241 | # Value is optional in DD and AWS 242 | if not value: 243 | return key 244 | return "{}:{}".format(key, value) 245 | 246 | 247 | def parse_get_resources_response_for_tags_by_arn(get_resources_page): 248 | """Parses a page of GetResources response for the mapping from ARN to tags 249 | 250 | Args: 251 | get_resources_page (dict[]>): one page of the GetResources response. 252 | Partial example: 253 | {"ResourceTagMappingList": [{ 254 | 'ResourceARN': 'arn:aws:lambda:us-east-1:123497598159:function:my-test-lambda', 255 | 'Tags': [{'Key': 'stage', 'Value': 'dev'}, {'Key': 'team', 'Value': 'serverless'}] 256 | }]} 257 | 258 | Returns: 259 | tags_by_arn (dict): Lambda tag lists keyed by ARN 260 | """ 261 | tags_by_arn = defaultdict(list) 262 | 263 | aws_resouce_tag_mappings = get_resources_page["ResourceTagMappingList"] 264 | for aws_resource_tag_mapping in aws_resouce_tag_mappings: 265 | function_arn = aws_resource_tag_mapping["ResourceARN"] 266 | raw_aws_tags = aws_resource_tag_mapping["Tags"] 267 | tags = map(get_dd_tag_string_from_aws_dict, raw_aws_tags) 268 | 269 | tags_by_arn[function_arn] += tags 270 | 271 | return tags_by_arn 272 | 273 | 274 | def build_tags_by_arn_cache(): 275 | """Makes API calls to GetResources to get the live tags of the account's Lambda functions 276 | 277 | Returns an empty dict instead of fetching custom tags if the tag fetch env variable is not set to true 278 | 279 | Returns: 280 | tags_by_arn_cache (dict): each Lambda's tags in a dict keyed by ARN 281 | """ 282 | tags_by_arn_cache = {} 283 | get_resources_paginator = resource_tagging_client.get_paginator("get_resources") 284 | 285 | try: 286 | for page in get_resources_paginator.paginate( 287 | ResourceTypeFilters=[GET_RESOURCES_LAMBDA_FILTER], ResourcesPerPage=100 288 | ): 289 | lambda_stats.distribution( 290 | "{}.get_resources_api_calls".format(ENHANCED_METRICS_NAMESPACE_PREFIX), 291 | 1, 292 | ) 293 | page_tags_by_arn = parse_get_resources_response_for_tags_by_arn(page) 294 | tags_by_arn_cache.update(page_tags_by_arn) 295 | 296 | except ClientError: 297 | log.exception( 298 | "Encountered a ClientError when trying to fetch tags. 
You may need to give " 299 | "this Lambda's role the 'tag:GetResources' permission" 300 | ) 301 | 302 | log.debug( 303 | "Built this tags cache from GetResources API calls: %s", tags_by_arn_cache 304 | ) 305 | 306 | return tags_by_arn_cache 307 | 308 | 309 | def parse_and_submit_enhanced_metrics(logs): 310 | """Parses enhanced metrics from REPORT logs and submits them to DD with tags 311 | 312 | Args: 313 | logs (dict[]): the logs parsed from the event in the split method 314 | See docstring below for an example. 315 | """ 316 | # If the Lambda layer is not present we can't submit enhanced metrics 317 | if not DD_SUBMIT_ENHANCED_METRICS: 318 | return 319 | 320 | for log in logs: 321 | try: 322 | enhanced_metrics = generate_enhanced_lambda_metrics( 323 | log, account_lambda_tags_cache 324 | ) 325 | for enhanced_metric in enhanced_metrics: 326 | enhanced_metric.submit_to_dd() 327 | except Exception: 328 | log.exception( 329 | "Encountered an error while trying to parse and submit enhanced metrics for log %s", 330 | log, 331 | ) 332 | 333 | 334 | def generate_enhanced_lambda_metrics(log, tags_cache): 335 | """Parses a Lambda log for enhanced Lambda metrics and tags 336 | 337 | Args: 338 | log (dict): a log parsed from the event in the split method 339 | Ex: { 340 | "id": "34988208851106313984209006125707332605649155257376768001", 341 | "timestamp": 1568925546641, 342 | "message": "END RequestId: 2f676573-c16b-4207-993a-51fb960d73e2\\n", 343 | "aws": { 344 | "awslogs": { 345 | "logGroup": "/aws/lambda/function_log_generator", 346 | "logStream": "2019/09/19/[$LATEST]0225597e48f74a659916f0e482df5b92", 347 | "owner": "172597598159" 348 | }, 349 | "function_version": "$LATEST", 350 | "invoked_function_arn": "arn:aws:lambda:us-east-1:172597598159:function:collect_logs_datadog_demo" 351 | }, 352 | "lambda": { 353 | "arn": "arn:aws:lambda:us-east-1:172597598159:function:function_log_generator" 354 | }, 355 | "ddsourcecategory": "aws", 356 | "ddtags": "env:demo,python_version:3.6,role:lambda,forwardername:collect_logs_datadog_demo,memorysize:128,forwarder_version:2.0.0,functionname:function_log_generator,env:none", 357 | "ddsource": "lambda", 358 | "service": "function_log_generator", 359 | "host": "arn:aws:lambda:us-east-1:172597598159:function:function_log_generator" 360 | } 361 | tags_cache (LambdaTagsCache): used to apply the Lambda's custom tags to the metrics 362 | 363 | Returns: 364 | DatadogMetricPoint[], where each metric has all of its tags 365 | """ 366 | log_function_arn = log.get("lambda", {}).get("arn") 367 | log_message = log.get("message") 368 | timestamp = log.get("timestamp") 369 | 370 | # If the log dict is missing any of this data it's not a Lambda REPORT log and we move on 371 | if not all( 372 | (log_function_arn, log_message, timestamp, log_message.startswith("REPORT")) 373 | ): 374 | return [] 375 | 376 | parsed_metrics = parse_metrics_from_report_log(log_message) 377 | if not parsed_metrics: 378 | return [] 379 | 380 | # Add the tags from ARN, custom tags cache, and env var 381 | tags_from_arn = parse_lambda_tags_from_arn(log_function_arn) 382 | lambda_custom_tags = tags_cache.get(log_function_arn) 383 | 384 | for parsed_metric in parsed_metrics: 385 | parsed_metric.add_tags(tags_from_arn + lambda_custom_tags) 386 | # Submit the metric with the timestamp of the log event 387 | parsed_metric.set_timestamp(int(timestamp)) 388 | 389 | return parsed_metrics 390 | 391 | 392 | def parse_lambda_tags_from_arn(arn): 393 | """Generate the list of lambda tags based on the data in the arn 
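For the example ARN below, the returned tags are region:us-east-1, account_id:172597598159, aws_account:172597598159 and functionname:my-lambda.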
394 | 395 | Args: 396 | arn (str): Lambda ARN. 397 | ex: arn:aws:lambda:us-east-1:172597598159:function:my-lambda[:optional-version] 398 | """ 399 | # Cap the number of times to split 400 | split_arn = arn.split(":") 401 | 402 | # If ARN includes version / alias at the end, drop it 403 | if len(split_arn) > 7: 404 | split_arn = split_arn[:7] 405 | 406 | _, _, _, region, account_id, _, function_name = split_arn 407 | 408 | return [ 409 | "region:{}".format(region), 410 | "account_id:{}".format(account_id), 411 | # Include the aws_account tag to match the aws.lambda CloudWatch metrics 412 | "aws_account:{}".format(account_id), 413 | "functionname:{}".format(function_name), 414 | ] 415 | 416 | 417 | def parse_metrics_from_report_log(report_log_line): 418 | """Parses and returns metrics from the REPORT Lambda log 419 | 420 | Args: 421 | report_log_line (str): The REPORT log generated by Lambda 422 | EX: "REPORT RequestId: 814ba7cb-071e-4181-9a09-fa41db5bccad Duration: 1711.87 ms \ 423 | Billed Duration: 1800 ms Memory Size: 128 MB Max Memory Used: 98 MB \ 424 | XRAY TraceId: 1-5d83c0ad-b8eb33a0b1de97d804fac890 SegmentId: 31255c3b19bd3637 Sampled: true" 425 | 426 | Returns: 427 | metrics - DatadogMetricPoint[] 428 | """ 429 | regex_match = REPORT_LOG_REGEX.search(report_log_line) 430 | 431 | if not regex_match: 432 | return [] 433 | 434 | metrics = [] 435 | 436 | for metric_name in METRICS_TO_PARSE_FROM_REPORT: 437 | metric_point_value = float(regex_match.group(metric_name)) 438 | # Multiply the duration metrics by 1/1000 to convert ms to seconds 439 | if metric_name in METRIC_ADJUSTMENT_FACTORS: 440 | metric_point_value *= METRIC_ADJUSTMENT_FACTORS[metric_name] 441 | 442 | dd_metric = DatadogMetricPoint( 443 | "{}.{}".format(ENHANCED_METRICS_NAMESPACE_PREFIX, metric_name), 444 | metric_point_value, 445 | ) 446 | metrics.append(dd_metric) 447 | 448 | estimated_cost_metric_point = DatadogMetricPoint( 449 | "{}.{}".format(ENHANCED_METRICS_NAMESPACE_PREFIX, ESTIMATED_COST_METRIC_NAME), 450 | calculate_estimated_cost( 451 | float(regex_match.group(BILLED_DURATION_METRIC_NAME)), 452 | float(regex_match.group(MEMORY_ALLOCATED_FIELD_NAME)), 453 | ), 454 | ) 455 | metrics.append(estimated_cost_metric_point) 456 | 457 | return metrics 458 | 459 | 460 | def calculate_estimated_cost(billed_duration_ms, memory_allocated): 461 | """Returns the estimated cost in USD of a Lambda invocation 462 | 463 | Args: 464 | billed_duration (float | int): number of milliseconds this invocation is billed for 465 | memory_allocated (float | int): amount of memory in MB allocated to the function execution 466 | 467 | See https://aws.amazon.com/lambda/pricing/ for latest pricing 468 | """ 469 | # Divide milliseconds by 1000 to get seconds 470 | gb_seconds = (billed_duration_ms / 1000.0) * (memory_allocated / 1024.0) 471 | 472 | return BASE_LAMBDA_INVOCATION_PRICE + gb_seconds * LAMBDA_PRICE_PER_GB_SECOND 473 | 474 | 475 | def get_enriched_lambda_log_tags(log): 476 | """ Retrieves extra tags from lambda, either read from the function arn, or by fetching lambda tags from the function itself. 
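Returns an empty list when the log event carries no Lambda ARN.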
477 | 478 | Args: 479 | log (dict): a log parsed from the event in the split method 480 | """ 481 | log_function_arn = log.get("lambda", {}).get("arn") 482 | if not log_function_arn: 483 | return [] 484 | tags_from_arn = parse_lambda_tags_from_arn(log_function_arn) 485 | lambda_custom_tags = account_lambda_tags_cache.get(log_function_arn) 486 | # Combine and dedup tags 487 | tags = list(set(tags_from_arn + lambda_custom_tags)) 488 | return tags 489 | -------------------------------------------------------------------------------- /src/fingerprint.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def fingerprint(query): 4 | query = query.strip().lower() 5 | 6 | query = re.sub(r'\\["\']', '', query) 7 | query = re.sub(r'[ \n\t\r\f]+', ' ', query) 8 | query = re.sub(r'\bnull\b', '?', query) 9 | query = re.sub(r'\b\d+\b', '?', query) 10 | 11 | # "str" => ? 12 | query = re.sub(r'".*?"', '?', query) 13 | # 'str' => ? 14 | query = re.sub(r"'.*?'", '?', query) 15 | 16 | query = re.sub(r'\b(in|values)([\s,]*\([\s?,]*\))+', '\\1(?+)', query) 17 | query = re.sub(r'\blimit \?(, ?\?| offset \?)?', 'limit ?', query) 18 | 19 | return query 20 | -------------------------------------------------------------------------------- /src/lambda_function.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License Version 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2018 Datadog, Inc. 5 | 6 | from __future__ import print_function 7 | 8 | import base64 9 | import gzip 10 | import json 11 | import os 12 | 13 | import boto3 14 | import itertools 15 | import re 16 | import six.moves.urllib as urllib # for for Python 2.7 urllib.unquote_plus 17 | import socket 18 | import ssl 19 | import logging 20 | from io import BytesIO, BufferedReader 21 | import time 22 | 23 | log = logging.getLogger() 24 | log.setLevel(logging.getLevelName(os.environ.get("DD_LOG_LEVEL", "INFO").upper())) 25 | 26 | try: 27 | import requests 28 | except ImportError: 29 | log.error( 30 | "Could not import the 'requests' package, please ensure the Datadog " 31 | "Lambda Layer is installed. https://dtdg.co/forwarder-layer" 32 | ) 33 | # Fallback to the botocore vendored version of requests, while ensuring 34 | # customers have the Datadog Lambda Layer installed. The vendored version 35 | # of requests is removed in botocore 1.13.x. 36 | from botocore.vendored import requests 37 | 38 | try: 39 | from enhanced_lambda_metrics import ( 40 | get_enriched_lambda_log_tags, 41 | parse_and_submit_enhanced_metrics, 42 | ) 43 | 44 | IS_ENHANCED_METRICS_FILE_PRESENT = True 45 | except ImportError: 46 | IS_ENHANCED_METRICS_FILE_PRESENT = False 47 | log.warn( 48 | "Could not import from enhanced_lambda_metrics so enhanced metrics " 49 | "will not be submitted. Ensure you've included the enhanced_lambda_metrics " 50 | "file in your Lambda project." 
51 | ) 52 | finally: 53 | log.debug(f"IS_ENHANCED_METRICS_FILE_PRESENT: {IS_ENHANCED_METRICS_FILE_PRESENT}") 54 | 55 | try: 56 | # Datadog Lambda layer is required to forward metrics 57 | from datadog_lambda.wrapper import datadog_lambda_wrapper 58 | from datadog_lambda.metric import lambda_stats 59 | 60 | DD_FORWARD_METRIC = True 61 | except ImportError: 62 | log.debug( 63 | "Could not import from the Datadog Lambda layer, metrics can't be forwarded" 64 | ) 65 | # For backward-compatibility 66 | DD_FORWARD_METRIC = False 67 | finally: 68 | log.debug(f"DD_FORWARD_METRIC: {DD_FORWARD_METRIC}") 69 | 70 | try: 71 | # Datadog Trace Layer is required to forward traces 72 | from trace_forwarder.connection import TraceConnection 73 | 74 | DD_FORWARD_TRACES = True 75 | except ImportError: 76 | # For backward-compatibility 77 | DD_FORWARD_TRACES = False 78 | finally: 79 | log.debug(f"DD_FORWARD_TRACES: {DD_FORWARD_TRACES}") 80 | 81 | 82 | def get_env_var(envvar, default, boolean=False): 83 | """ 84 | Return the value of the given environment variable with debug logging. 85 | When boolean=True, parse the value as a boolean case-insensitively. 86 | """ 87 | value = os.getenv(envvar, default=default) 88 | if boolean: 89 | value = value.lower() == "true" 90 | log.debug(f"{envvar}: {value}") 91 | return value 92 | 93 | 94 | ##################################### 95 | ############# PARAMETERS ############ 96 | ##################################### 97 | 98 | ## @param DD_API_KEY - String - conditional - default: none 99 | ## The Datadog API key associated with your Datadog Account 100 | ## It can be found here: 101 | ## 102 | ## * Datadog US Site: https://app.datadoghq.com/account/settings#api 103 | ## * Datadog EU Site: https://app.datadoghq.eu/account/settings#api 104 | ## 105 | ## Must be set if one of the following is not set: DD_API_KEY_SECRET_ARN, DD_API_KEY_SSM_NAME, DD_KMS_API_KEY 106 | # 107 | DD_API_KEY = "" 108 | 109 | ## @param DD_API_KEY_SECRET_ARN - String - optional - default: none 110 | ## ARN of Datadog API key stored in AWS Secrets Manager 111 | ## 112 | ## Supercedes: DD_API_KEY_SSM_NAME, DD_KMS_API_KEY, DD_API_KEY 113 | 114 | ## @param DD_API_KEY_SSM_NAME - String - optional - default: none 115 | ## Name of parameter containing Datadog API key in AWS SSM Parameter Store 116 | ## 117 | ## Supercedes: DD_KMS_API_KEY, DD_API_KEY 118 | 119 | ## @param DD_KMS_API_KEY - String - optional - default: none 120 | ## AWS KMS encrypted Datadog API key 121 | ## 122 | ## Supercedes: DD_API_KEY 123 | 124 | ## @param DD_FORWARD_LOG - boolean - optional - default: true 125 | ## Set this variable to `False` to disable log forwarding. 126 | ## E.g., when you only want to forward metrics from logs. 127 | # 128 | DD_FORWARD_LOG = get_env_var("DD_FORWARD_LOG", "true", boolean=True) 129 | 130 | ## @param DD_USE_TCP - boolean - optional -default: false 131 | ## Change this value to `true` to send your logs and metrics using the TCP network client 132 | ## By default, it uses the HTTP client. 133 | # 134 | DD_USE_TCP = get_env_var("DD_USE_TCP", "false", boolean=True) 135 | 136 | ## @param DD_USE_COMPRESSION - boolean - optional -default: true 137 | ## Only valid when sending logs over HTTP 138 | ## Change this value to `false` to send your logs without any compression applied 139 | ## By default, compression is enabled. 
140 | # 141 | DD_USE_COMPRESSION = get_env_var("DD_USE_COMPRESSION", "true", boolean=True) 142 | 143 | ## @param DD_USE_COMPRESSION - integer - optional -default: 6 144 | ## Change this value to set the compression level. 145 | ## Values range from 0 (no compression) to 9 (best compression). 146 | ## By default, compression is set to level 6. 147 | # 148 | DD_COMPRESSION_LEVEL = int(os.getenv("DD_COMPRESSION_LEVEL", 6)) 149 | 150 | ## @param DD_USE_SSL - boolean - optional -default: false 151 | ## Change this value to `true` to disable SSL 152 | ## Useful when you are forwarding your logs to a proxy. 153 | # 154 | DD_NO_SSL = get_env_var("DD_NO_SSL", "false", boolean=True) 155 | 156 | ## @param DD_SKIP_SSL_VALIDATION - boolean - optional -default: false 157 | ## Disable SSL certificate validation when forwarding logs via HTTP. 158 | # 159 | DD_SKIP_SSL_VALIDATION = get_env_var("DD_SKIP_SSL_VALIDATION", "false", boolean=True) 160 | 161 | ## @param DD_SITE - String - optional -default: datadoghq.com 162 | ## Define the Datadog Site to send your logs and metrics to. 163 | ## Set it to `datadoghq.eu` to send your logs and metrics to Datadog EU site. 164 | # 165 | DD_SITE = get_env_var("DD_SITE", default="datadoghq.com") 166 | 167 | ## @param DD_TAGS - list of comma separated strings - optional -default: none 168 | ## Pass custom tags as environment variable or through this variable. 169 | ## Ensure your tags are a comma separated list of strings with no trailing comma in the envvar! 170 | # 171 | DD_TAGS = get_env_var("DD_TAGS", "") 172 | 173 | ## @param DD_API_URL - Url to use for validating the the api key. Used for validating api key. 174 | DD_API_URL = get_env_var("DD_API_URL", default="https://api.{}".format(DD_SITE)) 175 | log.debug(f"DD_API_URL: {DD_API_URL}") 176 | 177 | ## @param DD_TRACE_INTAKE_URL - Url to use for validating the the api key. Used for validating api key. 178 | DD_TRACE_INTAKE_URL = get_env_var( 179 | "DD_TRACE_INTAKE_URL", default="https://trace.agent.{}".format(DD_SITE) 180 | ) 181 | 182 | if DD_USE_TCP: 183 | DD_URL = get_env_var("DD_URL", default="lambda-intake.logs." + DD_SITE) 184 | try: 185 | if "DD_SITE" in os.environ and DD_SITE == "datadoghq.eu": 186 | DD_PORT = int(get_env_var("DD_PORT", default="443")) 187 | else: 188 | DD_PORT = int(get_env_var("DD_PORT", default="10516")) 189 | except Exception: 190 | DD_PORT = 10516 191 | else: 192 | DD_URL = get_env_var("DD_URL", default="lambda-http-intake.logs." 
+ DD_SITE) 193 | DD_PORT = int(get_env_var("DD_PORT", default="443")) 194 | 195 | 196 | class ScrubbingRuleConfig(object): 197 | def __init__(self, name, pattern, placeholder): 198 | self.name = name 199 | self.pattern = pattern 200 | self.placeholder = placeholder 201 | 202 | 203 | # Scrubbing sensitive data 204 | # Option to redact all pattern that looks like an ip address / email address / custom pattern 205 | SCRUBBING_RULE_CONFIGS = [ 206 | ScrubbingRuleConfig( 207 | "REDACT_IP", "\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "xxx.xxx.xxx.xxx" 208 | ), 209 | ScrubbingRuleConfig( 210 | "REDACT_EMAIL", 211 | "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", 212 | "xxxxx@xxxxx.com", 213 | ), 214 | ScrubbingRuleConfig( 215 | "DD_SCRUBBING_RULE", 216 | get_env_var("DD_SCRUBBING_RULE", default=None), 217 | get_env_var("DD_SCRUBBING_RULE_REPLACEMENT", default="xxxxx"), 218 | ), 219 | ] 220 | 221 | 222 | # Use for include, exclude, and scrubbing rules 223 | def compileRegex(rule, pattern): 224 | if pattern is not None: 225 | if pattern == "": 226 | # If pattern is an empty string, raise exception 227 | raise Exception( 228 | "No pattern provided:\nAdd pattern or remove {} environment variable".format( 229 | rule 230 | ) 231 | ) 232 | try: 233 | return re.compile(pattern) 234 | except Exception: 235 | raise Exception( 236 | "could not compile {} regex with pattern: {}".format(rule, pattern) 237 | ) 238 | 239 | 240 | # Filtering logs 241 | # Option to include or exclude logs based on a pattern match 242 | INCLUDE_AT_MATCH = get_env_var("INCLUDE_AT_MATCH", default=None) 243 | include_regex = compileRegex("INCLUDE_AT_MATCH", INCLUDE_AT_MATCH) 244 | 245 | EXCLUDE_AT_MATCH = get_env_var("EXCLUDE_AT_MATCH", default=None) 246 | exclude_regex = compileRegex("EXCLUDE_AT_MATCH", EXCLUDE_AT_MATCH) 247 | 248 | if "DD_API_KEY_SECRET_ARN" in os.environ: 249 | SECRET_ARN = os.environ["DD_API_KEY_SECRET_ARN"] 250 | DD_API_KEY = boto3.client("secretsmanager").get_secret_value(SecretId=SECRET_ARN)[ 251 | "SecretString" 252 | ] 253 | elif "DD_API_KEY_SSM_NAME" in os.environ: 254 | SECRET_NAME = os.environ["DD_API_KEY_SSM_NAME"] 255 | DD_API_KEY = boto3.client("ssm").get_parameter( 256 | Name=SECRET_NAME, WithDecryption=True 257 | )["Parameter"]["Value"] 258 | elif "DD_KMS_API_KEY" in os.environ: 259 | ENCRYPTED = os.environ["DD_KMS_API_KEY"] 260 | DD_API_KEY = boto3.client("kms").decrypt( 261 | CiphertextBlob=base64.b64decode(ENCRYPTED) 262 | )["Plaintext"] 263 | if type(DD_API_KEY) is bytes: 264 | DD_API_KEY = DD_API_KEY.decode("utf-8") 265 | elif "DD_API_KEY" in os.environ: 266 | DD_API_KEY = os.environ["DD_API_KEY"] 267 | 268 | # Strip any trailing and leading whitespace from the API key 269 | DD_API_KEY = DD_API_KEY.strip() 270 | os.environ["DD_API_KEY"] = DD_API_KEY 271 | 272 | # Force the layer to use the exact same API key as the forwarder 273 | if DD_FORWARD_METRIC: 274 | from datadog import api 275 | 276 | api._api_key = DD_API_KEY 277 | 278 | # DD_API_KEY must be set 279 | if DD_API_KEY == "" or DD_API_KEY == "": 280 | raise Exception("Missing Datadog API key") 281 | # Check if the API key is the correct number of characters 282 | if len(DD_API_KEY) != 32: 283 | raise Exception( 284 | "The API key is not the expected length. 
" 285 | "Please confirm that your API key is correct" 286 | ) 287 | # Validate the API key 288 | validation_res = requests.get( 289 | "{}/api/v1/validate?api_key={}".format(DD_API_URL, DD_API_KEY) 290 | ) 291 | if not validation_res.ok: 292 | raise Exception("The API key is not valid.") 293 | 294 | trace_connection = None 295 | if DD_FORWARD_TRACES: 296 | trace_connection = TraceConnection(DD_TRACE_INTAKE_URL, DD_API_KEY) 297 | 298 | # DD_MULTILINE_LOG_REGEX_PATTERN: Multiline Log Regular Expression Pattern 299 | DD_MULTILINE_LOG_REGEX_PATTERN = get_env_var( 300 | "DD_MULTILINE_LOG_REGEX_PATTERN", default=None 301 | ) 302 | if DD_MULTILINE_LOG_REGEX_PATTERN: 303 | try: 304 | multiline_regex = re.compile( 305 | "[\n\r\f]+(?={})".format(DD_MULTILINE_LOG_REGEX_PATTERN) 306 | ) 307 | except Exception: 308 | raise Exception( 309 | "could not compile multiline regex with pattern: {}".format( 310 | DD_MULTILINE_LOG_REGEX_PATTERN 311 | ) 312 | ) 313 | multiline_regex_start_pattern = re.compile( 314 | "^{}".format(DD_MULTILINE_LOG_REGEX_PATTERN) 315 | ) 316 | 317 | rds_regex = re.compile("/aws/rds/(instance|cluster)/(?P[^/]+)/(?P[^/]+)") 318 | 319 | DD_SOURCE = "ddsource" 320 | DD_CUSTOM_TAGS = "ddtags" 321 | DD_SERVICE = "service" 322 | DD_HOST = "host" 323 | DD_FORWARDER_VERSION = "3.9.0" 324 | 325 | 326 | class RetriableException(Exception): 327 | pass 328 | 329 | 330 | class ScrubbingException(Exception): 331 | pass 332 | 333 | 334 | class DatadogClient(object): 335 | """ 336 | Client that implements a exponential retrying logic to send a batch of logs. 337 | """ 338 | 339 | def __init__(self, client, max_backoff=30): 340 | self._client = client 341 | self._max_backoff = max_backoff 342 | 343 | def send(self, logs): 344 | backoff = 1 345 | while True: 346 | try: 347 | self._client.send(logs) 348 | return 349 | except RetriableException: 350 | time.sleep(backoff) 351 | if backoff < self._max_backoff: 352 | backoff *= 2 353 | continue 354 | 355 | def __enter__(self): 356 | self._client.__enter__() 357 | return self 358 | 359 | def __exit__(self, ex_type, ex_value, traceback): 360 | self._client.__exit__(ex_type, ex_value, traceback) 361 | 362 | 363 | class DatadogTCPClient(object): 364 | """ 365 | Client that sends a batch of logs over TCP. 
366 | """ 367 | 368 | def __init__(self, host, port, no_ssl, api_key, scrubber): 369 | self.host = host 370 | self.port = port 371 | self._use_ssl = not no_ssl 372 | self._api_key = api_key 373 | self._scrubber = scrubber 374 | self._sock = None 375 | 376 | def _connect(self): 377 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 378 | if self._use_ssl: 379 | sock = ssl.create_default_context().wrap_socket( 380 | sock, server_hostname=self.host 381 | ) 382 | sock.connect((self.host, self.port)) 383 | self._sock = sock 384 | 385 | def _close(self): 386 | if self._sock: 387 | self._sock.close() 388 | 389 | def _reset(self): 390 | self._close() 391 | self._connect() 392 | 393 | def send(self, logs): 394 | try: 395 | frame = self._scrubber.scrub( 396 | "".join(["{} {}\n".format(self._api_key, log) for log in logs]) 397 | ) 398 | self._sock.sendall(frame.encode("UTF-8")) 399 | except ScrubbingException: 400 | raise Exception("could not scrub the payload") 401 | except Exception: 402 | # most likely a network error, reset the connection 403 | self._reset() 404 | raise RetriableException() 405 | 406 | def __enter__(self): 407 | self._connect() 408 | return self 409 | 410 | def __exit__(self, ex_type, ex_value, traceback): 411 | self._close() 412 | 413 | 414 | class DatadogHTTPClient(object): 415 | """ 416 | Client that sends a batch of logs over HTTP. 417 | """ 418 | 419 | _POST = "POST" 420 | if DD_USE_COMPRESSION: 421 | _HEADERS = {"Content-type": "application/json", "Content-Encoding": "gzip"} 422 | else: 423 | _HEADERS = {"Content-type": "application/json"} 424 | 425 | def __init__( 426 | self, host, port, no_ssl, skip_ssl_validation, api_key, scrubber, timeout=10 427 | ): 428 | protocol = "http" if no_ssl else "https" 429 | self._url = "{}://{}:{}/v1/input/{}".format(protocol, host, port, api_key) 430 | self._scrubber = scrubber 431 | self._timeout = timeout 432 | self._session = None 433 | self._ssl_validation = not skip_ssl_validation 434 | 435 | def _connect(self): 436 | self._session = requests.Session() 437 | self._session.headers.update(self._HEADERS) 438 | 439 | def _close(self): 440 | self._session.close() 441 | 442 | def send(self, logs): 443 | """ 444 | Sends a batch of log, only retry on server and network errors. 
445 | """ 446 | try: 447 | data = self._scrubber.scrub("[{}]".format(",".join(logs))) 448 | except ScrubbingException: 449 | raise Exception("could not scrub the payload") 450 | if DD_USE_COMPRESSION: 451 | data = compress_logs(data, DD_COMPRESSION_LEVEL) 452 | try: 453 | resp = self._session.post( 454 | self._url, data, timeout=self._timeout, verify=self._ssl_validation 455 | ) 456 | except Exception: 457 | # most likely a network error 458 | raise RetriableException() 459 | if resp.status_code >= 500: 460 | # server error 461 | raise RetriableException() 462 | elif resp.status_code >= 400: 463 | # client error 464 | raise Exception( 465 | "client error, status: {}, reason {}".format( 466 | resp.status_code, resp.reason 467 | ) 468 | ) 469 | else: 470 | # success 471 | return 472 | 473 | def __enter__(self): 474 | self._connect() 475 | return self 476 | 477 | def __exit__(self, ex_type, ex_value, traceback): 478 | self._close() 479 | 480 | 481 | class DatadogBatcher(object): 482 | def __init__(self, max_log_size_bytes, max_size_bytes, max_size_count): 483 | self._max_log_size_bytes = max_log_size_bytes 484 | self._max_size_bytes = max_size_bytes 485 | self._max_size_count = max_size_count 486 | 487 | def _sizeof_bytes(self, log): 488 | return len(log.encode("UTF-8")) 489 | 490 | def batch(self, logs): 491 | """ 492 | Returns an array of batches. 493 | Each batch contains at most max_size_count logs and 494 | is not strictly greater than max_size_bytes. 495 | All logs strictly greater than max_log_size_bytes are dropped. 496 | """ 497 | batches = [] 498 | batch = [] 499 | size_bytes = 0 500 | size_count = 0 501 | for log in logs: 502 | log_size_bytes = self._sizeof_bytes(log) 503 | if size_count > 0 and ( 504 | size_count >= self._max_size_count 505 | or size_bytes + log_size_bytes > self._max_size_bytes 506 | ): 507 | batches.append(batch) 508 | batch = [] 509 | size_bytes = 0 510 | size_count = 0 511 | # all logs exceeding max_log_size_bytes are dropped here 512 | if log_size_bytes <= self._max_log_size_bytes: 513 | batch.append(log) 514 | size_bytes += log_size_bytes 515 | size_count += 1 516 | if size_count > 0: 517 | batches.append(batch) 518 | return batches 519 | 520 | 521 | def compress_logs(batch, level): 522 | if level < 0: 523 | compression_level = 0 524 | elif level > 9: 525 | compression_level = 9 526 | else: 527 | compression_level = level 528 | 529 | return gzip.compress(bytes(batch, "utf-8"), compression_level) 530 | 531 | 532 | class ScrubbingRule(object): 533 | def __init__(self, regex, placeholder): 534 | self.regex = regex 535 | self.placeholder = placeholder 536 | 537 | 538 | class DatadogScrubber(object): 539 | def __init__(self, configs): 540 | rules = [] 541 | for config in configs: 542 | if config.name in os.environ: 543 | rules.append( 544 | ScrubbingRule( 545 | compileRegex(config.name, config.pattern), config.placeholder 546 | ) 547 | ) 548 | self._rules = rules 549 | 550 | def scrub(self, payload): 551 | for rule in self._rules: 552 | try: 553 | payload = rule.regex.sub(rule.placeholder, payload) 554 | except Exception: 555 | raise ScrubbingException() 556 | return payload 557 | 558 | 559 | def log_has_report_msg(log): 560 | msg = log.get("message", "") 561 | if isinstance(msg, str) and msg.startswith("REPORT"): 562 | return True 563 | return False 564 | 565 | 566 | def datadog_forwarder(event, context): 567 | """The actual lambda function entry point""" 568 | metrics, logs, traces = split(enrich(parse(event, context))) 569 | 570 | if DD_FORWARD_LOG: 571 | 
forward_logs(filter_logs(map(json.dumps, logs))) 572 | 573 | if DD_FORWARD_METRIC: 574 | forward_metrics(metrics) 575 | 576 | if DD_FORWARD_TRACES and len(traces) > 0: 577 | forward_traces(traces) 578 | 579 | if IS_ENHANCED_METRICS_FILE_PRESENT: 580 | report_logs = filter(log_has_report_msg, logs) 581 | parse_and_submit_enhanced_metrics(report_logs) 582 | 583 | 584 | if DD_FORWARD_METRIC or DD_FORWARD_TRACES: 585 | # Datadog Lambda layer is required to forward metrics 586 | lambda_handler = datadog_lambda_wrapper(datadog_forwarder) 587 | else: 588 | lambda_handler = datadog_forwarder 589 | 590 | 591 | def forward_logs(logs): 592 | """Forward logs to Datadog""" 593 | scrubber = DatadogScrubber(SCRUBBING_RULE_CONFIGS) 594 | if DD_USE_TCP: 595 | batcher = DatadogBatcher(256 * 1000, 256 * 1000, 1) 596 | cli = DatadogTCPClient(DD_URL, DD_PORT, DD_NO_SSL, DD_API_KEY, scrubber) 597 | else: 598 | batcher = DatadogBatcher(256 * 1000, 2 * 1000 * 1000, 200) 599 | cli = DatadogHTTPClient( 600 | DD_URL, DD_PORT, DD_NO_SSL, DD_SKIP_SSL_VALIDATION, DD_API_KEY, scrubber 601 | ) 602 | 603 | with DatadogClient(cli) as client: 604 | for batch in batcher.batch(logs): 605 | try: 606 | client.send(batch) 607 | except Exception: 608 | log.exception(f"Exception while forwarding log batch {batch}") 609 | else: 610 | log.debug(f"Forwarded {len(batch)} logs") 611 | 612 | 613 | def parse(event, context): 614 | """Parse Lambda input to normalized events""" 615 | metadata = generate_metadata(context) 616 | try: 617 | # Route to the corresponding parser 618 | event_type = parse_event_type(event) 619 | if event_type == "s3": 620 | events = s3_handler(event, context, metadata) 621 | elif event_type == "awslogs": 622 | events = awslogs_handler(event, context, metadata) 623 | elif event_type == "events": 624 | events = cwevent_handler(event, metadata) 625 | elif event_type == "sns": 626 | events = sns_handler(event, metadata) 627 | elif event_type == "kinesis": 628 | events = kinesis_awslogs_handler(event, context, metadata) 629 | except Exception as e: 630 | # Logs through the socket the error 631 | err_message = "Error parsing the object. 
Exception: {} for event {}".format( 632 | str(e), event 633 | ) 634 | events = [err_message] 635 | 636 | return normalize_events(events, metadata) 637 | 638 | 639 | def enrich(events): 640 | """Adds event-specific tags and attributes to each event 641 | 642 | Args: 643 | events (dict[]): the list of event dicts we want to enrich 644 | """ 645 | for event in events: 646 | add_metadata_to_lambda_log(event) 647 | 648 | return events 649 | 650 | 651 | def add_metadata_to_lambda_log(event): 652 | """Mutate log dict to add tags, host, and service metadata 653 | 654 | * tags for functionname, aws_account, region 655 | * host from the Lambda ARN 656 | * service from the Lambda name 657 | 658 | If the event arg is not a Lambda log then this returns without doing anything 659 | 660 | Args: 661 | event (dict): the event we are adding Lambda metadata to 662 | """ 663 | lambda_log_metadata = event.get("lambda", {}) 664 | lambda_log_arn = lambda_log_metadata.get("arn") 665 | 666 | # Do not mutate the event if it's not from Lambda 667 | if not lambda_log_arn: 668 | return 669 | 670 | # Function name is the sixth piece of the ARN 671 | function_name = lambda_log_arn.split(":")[6] 672 | 673 | event[DD_HOST] = lambda_log_arn 674 | event[DD_SERVICE] = function_name 675 | 676 | tags = ["functionname:{}".format(function_name)] 677 | 678 | # Add any enhanced tags from metadata 679 | if IS_ENHANCED_METRICS_FILE_PRESENT: 680 | tags += get_enriched_lambda_log_tags(event) 681 | 682 | # Dedup tags, so we don't end up with functionname twice 683 | tags = list(set(tags)) 684 | tags.sort() # Keep order deterministic 685 | 686 | event[DD_CUSTOM_TAGS] = ",".join([event[DD_CUSTOM_TAGS]] + tags) 687 | 688 | 689 | def generate_metadata(context): 690 | metadata = { 691 | "ddsourcecategory": "aws", 692 | "aws": { 693 | "function_version": context.function_version, 694 | "invoked_function_arn": context.invoked_function_arn, 695 | }, 696 | } 697 | # Add custom tags here by adding new value with the following format "key1:value1, key2:value2" - might be subject to modifications 698 | dd_custom_tags_data = { 699 | "forwardername": context.function_name.lower(), 700 | "forwarder_memorysize": context.memory_limit_in_mb, 701 | "forwarder_version": DD_FORWARDER_VERSION, 702 | } 703 | metadata[DD_CUSTOM_TAGS] = ",".join( 704 | filter( 705 | None, 706 | [ 707 | DD_TAGS, 708 | ",".join( 709 | ["{}:{}".format(k, v) for k, v in dd_custom_tags_data.items()] 710 | ), 711 | ], 712 | ) 713 | ) 714 | 715 | return metadata 716 | 717 | 718 | def extract_trace(event): 719 | """Extract traces from an event if possible""" 720 | try: 721 | message = event["message"] 722 | obj = json.loads(event["message"]) 723 | if not "traces" in obj or not isinstance(obj["traces"], list): 724 | return None 725 | return {"message": message, "tags": event[DD_CUSTOM_TAGS]} 726 | except Exception: 727 | return None 728 | 729 | 730 | def extract_metric(event): 731 | """Extract metric from an event if possible""" 732 | try: 733 | metric = json.loads(event["message"]) 734 | required_attrs = {"m", "v", "e", "t"} 735 | if not all(attr in metric for attr in required_attrs): 736 | return None 737 | if not isinstance(metric["t"], list): 738 | return None 739 | 740 | metric["t"] += event[DD_CUSTOM_TAGS].split(",") 741 | return metric 742 | except Exception: 743 | return None 744 | 745 | 746 | def split(events): 747 | """Split events into metrics, logs, and traces 748 | """ 749 | metrics, logs, traces = [], [], [] 750 | for event in events: 751 | metric = extract_metric(event) 
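# Each event lands in exactly one bucket: metrics take precedence over traces, and everything else is forwarded as a log.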


def split(events):
    """Split events into metrics, logs, and traces"""
    metrics, logs, traces = [], [], []
    for event in events:
        metric = extract_metric(event)
        trace = extract_trace(event)
        if metric and DD_FORWARD_METRIC:
            metrics.append(metric)
        elif trace and DD_FORWARD_TRACES:
            traces.append(trace)
        else:
            logs.append(event)
    return metrics, logs, traces


# Should only be called when INCLUDE_AT_MATCH and/or EXCLUDE_AT_MATCH exist
def filter_logs(logs):
    """
    Applies log filtering rules.
    If no filtering rules exist, return all the logs.
    """
    if INCLUDE_AT_MATCH is None and EXCLUDE_AT_MATCH is None:
        return logs
    # Add logs that should be sent to logs_to_send
    logs_to_send = []
    # Test each log for exclusion and inclusion, if the criteria exist
    for log in logs:
        try:
            if EXCLUDE_AT_MATCH is not None:
                # If an exclude match is found, do not add the log to logs_to_send
                if re.search(exclude_regex, log):
                    continue
            if INCLUDE_AT_MATCH is not None:
                # If no include match is found, do not add the log to logs_to_send
                if not re.search(include_regex, log):
                    continue
            logs_to_send.append(log)
        except ScrubbingException:
            raise Exception("could not filter the payload")
    return logs_to_send


def forward_metrics(metrics):
    """
    Forward custom metrics submitted via logs to Datadog in a background thread
    using `lambda_stats` that is provided by the Datadog Python Lambda Layer.
    """
    for metric in metrics:
        try:
            lambda_stats.distribution(
                metric["m"], metric["v"], timestamp=metric["e"], tags=metric["t"]
            )
        except Exception:
            log.exception(f"Exception while forwarding metric {metric}")
        else:
            log.debug(f"Forwarded metric: {metric}")


def forward_traces(traces):
    for trace in traces:
        try:
            trace_connection.send_trace(trace["message"], trace["tags"])
        except Exception:
            log.exception(f"Exception while forwarding trace {trace}")
        else:
            log.debug(f"Forwarded trace: {trace}")


# Utility functions


def normalize_events(events, metadata):
    normalized = []
    for event in events:
        if isinstance(event, dict):
            normalized.append(merge_dicts(event, metadata))
        elif isinstance(event, str):
            normalized.append(merge_dicts({"message": event}, metadata))
        else:
            # Drop events that are neither dicts nor strings
            continue
    return normalized


def parse_event_type(event):
    if "Records" in event and len(event["Records"]) > 0:
        if "s3" in event["Records"][0]:
            return "s3"
        elif "Sns" in event["Records"][0]:
            return "sns"
        elif "kinesis" in event["Records"][0]:
            return "kinesis"

    elif "awslogs" in event:
        return "awslogs"

    elif "detail" in event:
        return "events"
    raise Exception("Event type not supported (see #Event supported section)")
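
# --- Illustrative trigger shapes (editor's note, not part of the original source) ---
# parse_event_type() only inspects the top-level keys, so minimal payloads that
# would be routed to each handler look roughly like this (values are placeholders):
#   s3:       {"Records": [{"s3": {"bucket": {"name": "my-bucket"}, "object": {"key": "path/to/log.gz"}}}]}
#   sns:      {"Records": [{"Sns": {"Message": "hello"}}]}
#   kinesis:  {"Records": [{"kinesis": {"data": "<base64 CloudWatch Logs payload>"}}]}
#   awslogs:  {"awslogs": {"data": "<base64-encoded gzipped JSON>"}}
#   events:   {"detail": {"some": "CloudWatch Event detail"}}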


# Handle S3 events
def s3_handler(event, context, metadata):
    s3 = boto3.client("s3")

    # Get the bucket and object key from the event
    bucket = event["Records"][0]["s3"]["bucket"]["name"]
    key = urllib.parse.unquote_plus(event["Records"][0]["s3"]["object"]["key"])

    source = parse_event_source(event, key)
    metadata[DD_SOURCE] = source
    # Default service to source value
    metadata[DD_SERVICE] = source
    # Get the ARN of the service and set it as the hostname
    hostname = parse_service_arn(source, key, bucket, context)
    if hostname:
        metadata[DD_HOST] = hostname

    # Extract the S3 object
    response = s3.get_object(Bucket=bucket, Key=key)
    body = response["Body"]
    data = body.read()

    # Decompress data that has a .gz extension or gzip magic header http://www.onicos.com/staff/iz/formats/gzip.html
    if key[-3:] == ".gz" or data[:2] == b"\x1f\x8b":
        with gzip.GzipFile(fileobj=BytesIO(data)) as decompress_stream:
            # Reading line by line avoids a bug where gzip would take a very long time (>5min)
            # for files around 60MB gzipped
            data = b"".join(BufferedReader(decompress_stream))

    if is_cloudtrail(str(key)):
        cloud_trail = json.loads(data)
        for event in cloud_trail["Records"]:
            # Create structured object and send it
            structured_line = merge_dicts(
                event, {"aws": {"s3": {"bucket": bucket, "key": key}}}
            )
            yield structured_line
    else:
        # Check if using multiline log regex pattern
        # and determine whether line or pattern separated logs
        data = data.decode("utf-8")
        if DD_MULTILINE_LOG_REGEX_PATTERN and multiline_regex_start_pattern.match(data):
            split_data = multiline_regex.split(data)
        else:
            split_data = data.splitlines()

        # Send lines to Datadog
        for line in split_data:
            # Create structured object and send it
            structured_line = {
                "aws": {"s3": {"bucket": bucket, "key": key}},
                "message": line,
            }
            yield structured_line


# Handle CloudWatch logs from Kinesis
def kinesis_awslogs_handler(event, context, metadata):
    def reformat_record(record):
        return {"awslogs": {"data": record["kinesis"]["data"]}}

    return itertools.chain.from_iterable(
        awslogs_handler(reformat_record(r), context, metadata) for r in event["Records"]
    )


# Handle CloudWatch logs
def awslogs_handler(event, context, metadata):
    # Get logs
    with gzip.GzipFile(
        fileobj=BytesIO(base64.b64decode(event["awslogs"]["data"]))
    ) as decompress_stream:
        # Reading line by line avoids a bug where gzip would take a very long
        # time (>5min) for files around 60MB gzipped
        data = b"".join(BufferedReader(decompress_stream))
    logs = json.loads(data)

    # Set the source on the logs
    source = logs.get("logGroup", "cloudwatch")
    metadata[DD_SOURCE] = parse_event_source(event, source)

    # Default service to source value
    metadata[DD_SERVICE] = metadata[DD_SOURCE]

    # Build aws attributes
    aws_attributes = {
        "aws": {
            "awslogs": {
                "logGroup": logs["logGroup"],
                "logStream": logs["logStream"],
                "owner": logs["owner"],
            }
        }
    }

    # Set host as log group where cloudwatch is source
    if metadata[DD_SOURCE] == "cloudwatch":
        metadata[DD_HOST] = aws_attributes["aws"]["awslogs"]["logGroup"]

    # When parsing rds logs, use the cloudwatch log group name to derive the
    # rds instance name, and add the log name of the stream ingested
    if metadata[DD_SOURCE] == "rds":
        match = rds_regex.match(logs["logGroup"])
        if match is not None:
            metadata[DD_HOST] = match.group("host")
            metadata[DD_CUSTOM_TAGS] = (
                metadata[DD_CUSTOM_TAGS] + ",logname:" + match.group("name")
            )
            # We can intuit the sourcecategory in some cases
            if match.group("name") == "postgresql":
metadata[DD_CUSTOM_TAGS] + ",sourcecategory:" + match.group("name") 960 | 961 | # For Lambda logs we want to extract the function name, 962 | # then rebuild the arn of the monitored lambda using that name. 963 | # Start by splitting the log group to get the function name 964 | if metadata[DD_SOURCE] == "lambda": 965 | log_group_parts = logs["logGroup"].split("/lambda/") 966 | if len(log_group_parts) > 1: 967 | function_name = log_group_parts[1].lower() 968 | # Split the arn of the forwarder to extract the prefix 969 | arn_parts = context.invoked_function_arn.split("function:") 970 | if len(arn_parts) > 0: 971 | arn_prefix = arn_parts[0] 972 | # Rebuild the arn by replacing the function name 973 | arn = arn_prefix + "function:" + function_name 974 | # Add the arn as a log attribute 975 | arn_attributes = {"lambda": {"arn": arn}} 976 | aws_attributes = merge_dicts(aws_attributes, arn_attributes) 977 | 978 | env_tag_exists = ( 979 | metadata[DD_CUSTOM_TAGS].startswith("env:") 980 | or ",env:" in metadata[DD_CUSTOM_TAGS] 981 | ) 982 | # If there is no env specified, default to env:none 983 | if not env_tag_exists: 984 | metadata[DD_CUSTOM_TAGS] += ",env:none" 985 | 986 | # Create and send structured logs to Datadog 987 | for log in logs["logEvents"]: 988 | yield merge_dicts(log, aws_attributes) 989 | 990 | 991 | # Handle Cloudwatch Events 992 | def cwevent_handler(event, metadata): 993 | data = event 994 | 995 | # Set the source on the log 996 | source = data.get("source", "cloudwatch") 997 | service = source.split(".") 998 | if len(service) > 1: 999 | metadata[DD_SOURCE] = service[1] 1000 | else: 1001 | metadata[DD_SOURCE] = "cloudwatch" 1002 | ##default service to source value 1003 | metadata[DD_SERVICE] = metadata[DD_SOURCE] 1004 | 1005 | yield data 1006 | 1007 | 1008 | # Handle Sns events 1009 | def sns_handler(event, metadata): 1010 | data = event 1011 | # Set the source on the log 1012 | metadata[DD_SOURCE] = parse_event_source(event, "sns") 1013 | 1014 | for ev in data["Records"]: 1015 | # Create structured object and send it 1016 | structured_line = ev 1017 | yield structured_line 1018 | 1019 | 1020 | def merge_dicts(a, b, path=None): 1021 | if path is None: 1022 | path = [] 1023 | for key in b: 1024 | if key in a: 1025 | if isinstance(a[key], dict) and isinstance(b[key], dict): 1026 | merge_dicts(a[key], b[key], path + [str(key)]) 1027 | elif a[key] == b[key]: 1028 | pass # same leaf value 1029 | else: 1030 | raise Exception( 1031 | "Conflict while merging metadatas and the log entry at %s" 1032 | % ".".join(path + [str(key)]) 1033 | ) 1034 | else: 1035 | a[key] = b[key] 1036 | return a 1037 | 1038 | 1039 | cloudtrail_regex = re.compile( 1040 | "\d+_CloudTrail_\w{2}-\w{4,9}-\d_\d{8}T\d{4}Z.+.json.gz$", re.I 1041 | ) 1042 | 1043 | 1044 | def is_cloudtrail(key): 1045 | match = cloudtrail_regex.search(key) 1046 | return bool(match) 1047 | 1048 | 1049 | def parse_event_source(event, key): 1050 | if "elasticloadbalancing" in key: 1051 | return "elb" 1052 | for source in [ 1053 | "dms", 1054 | "codebuild", 1055 | "lambda", 1056 | "redshift", 1057 | "cloudfront", 1058 | "kinesis", 1059 | "/aws/rds", 1060 | "mariadb", 1061 | "mysql", 1062 | "apigateway", 1063 | "route53", 1064 | "vpc", 1065 | "sns", 1066 | "waf", 1067 | "docdb", 1068 | "fargate", 1069 | ]: 1070 | if source in key: 1071 | return source.replace("/aws/", "") 1072 | if "api-gateway" in key.lower() or "apigateway" in key.lower(): 1073 | return "apigateway" 1074 | if is_cloudtrail(str(key)) or ( 1075 | "logGroup" in event and 
event["logGroup"] == "CloudTrail" 1076 | ): 1077 | return "cloudtrail" 1078 | if "awslogs" in event: 1079 | return "cloudwatch" 1080 | if "Records" in event and len(event["Records"]) > 0: 1081 | if "s3" in event["Records"][0]: 1082 | return "s3" 1083 | 1084 | return "aws" 1085 | 1086 | 1087 | def parse_service_arn(source, key, bucket, context): 1088 | if source == "elb": 1089 | # For ELB logs we parse the filename to extract parameters in order to rebuild the ARN 1090 | # 1. We extract the region from the filename 1091 | # 2. We extract the loadbalancer name and replace the "." by "/" to match the ARN format 1092 | # 3. We extract the id of the loadbalancer 1093 | # 4. We build the arn 1094 | idsplit = key.split("/") 1095 | # If there is a prefix on the S3 bucket, idsplit[1] will be "AWSLogs" 1096 | # Remove the prefix before splitting they key 1097 | if len(idsplit) > 1 and idsplit[1] == "AWSLogs": 1098 | idsplit = idsplit[1:] 1099 | keysplit = "/".join(idsplit).split("_") 1100 | # If no prefix, split the key 1101 | else: 1102 | keysplit = key.split("_") 1103 | if len(keysplit) > 3: 1104 | region = keysplit[2].lower() 1105 | name = keysplit[3] 1106 | elbname = name.replace(".", "/") 1107 | if len(idsplit) > 1: 1108 | idvalue = idsplit[1] 1109 | return "arn:aws:elasticloadbalancing:{}:{}:loadbalancer/{}".format( 1110 | region, idvalue, elbname 1111 | ) 1112 | if source == "s3": 1113 | # For S3 access logs we use the bucket name to rebuild the arn 1114 | if bucket: 1115 | return "arn:aws:s3:::{}".format(bucket) 1116 | if source == "cloudfront": 1117 | # For Cloudfront logs we need to get the account and distribution id from the lambda arn and the filename 1118 | # 1. We extract the cloudfront id from the filename 1119 | # 2. We extract the AWS account id from the lambda arn 1120 | # 3. We build the arn 1121 | namesplit = key.split("/") 1122 | if len(namesplit) > 0: 1123 | filename = namesplit[len(namesplit) - 1] 1124 | # (distribution-ID.YYYY-MM-DD-HH.unique-ID.gz) 1125 | filenamesplit = filename.split(".") 1126 | if len(filenamesplit) > 3: 1127 | distributionID = filenamesplit[len(filenamesplit) - 4].lower() 1128 | arn = context.invoked_function_arn 1129 | arnsplit = arn.split(":") 1130 | if len(arnsplit) == 7: 1131 | awsaccountID = arnsplit[4].lower() 1132 | return "arn:aws:cloudfront::{}:distribution/{}".format( 1133 | awsaccountID, distributionID 1134 | ) 1135 | if source == "redshift": 1136 | # For redshift logs we leverage the filename to extract the relevant information 1137 | # 1. We extract the region from the filename 1138 | # 2. We extract the account-id from the filename 1139 | # 3. We extract the name of the cluster 1140 | # 4. 
        namesplit = key.split("/")
        if len(namesplit) == 8:
            region = namesplit[3].lower()
            accountID = namesplit[1].lower()
            filename = namesplit[7]
            filesplit = filename.split("_")
            if len(filesplit) == 6:
                clustername = filesplit[3]
                return "arn:aws:redshift:{}:{}:cluster:{}".format(
                    region, accountID, clustername
                )
    return


--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
import lambda_function
import fingerprint


def my_enrich_slow_logs(events):
    """Replacement for lambda_function.enrich that normalizes slow-query log events.

    For each event, keep the "#" header lines, drop "SET timestamp=" and "use" lines,
    and replace the SQL statement with its fingerprint.
    """
    for event in events:
        messages = []
        sql = ''

        for line in event['message'].split("\n"):
            if line.startswith("#"):
                messages.append(line)
            elif line.startswith("SET timestamp="):
                pass
            elif line.startswith("use "):
                pass
            else:
                # Keep a separator so tokens on adjacent lines don't run together
                sql = sql + line + "\n"

        messages.append(fingerprint.fingerprint(sql))

        event['message'] = "\n".join(messages)

        lambda_function.add_metadata_to_lambda_log(event)
    return events


def lambda_handler(event, context):
    # Swap in the slow-query enrichment before delegating to the Datadog forwarder
    lambda_function.enrich = my_enrich_slow_logs

    return lambda_function.lambda_handler(event, context)


--------------------------------------------------------------------------------
/tests/test_fingerprint.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import sys
sys.path.append('src/')

import unittest
from fingerprint import fingerprint


class TestFingerprint(unittest.TestCase):

    def test_sql_simple(self):
        self.assertEqual(fingerprint('SELECT * FROM tbl WHERE col1 = "abc"'),
                         "select * from tbl where col1 = ?")

    def test_sql_simple2(self):
        self.assertEqual(fingerprint('SELECT * FROM tbl WHERE col1 = 123'),
                         "select * from tbl where col1 = ?")

    def test_sql_wherein(self):
        self.assertEqual(fingerprint('SELECT * FROM tbl WHERE id IN ("a", "b", 123)'),
                         "select * from tbl where id in(?+)")

    def test_sql_japanese(self):
        self.assertEqual(fingerprint('SELECT * FROM tbl WHERE col1 LIKE "%ソ%"'),
                         "select * from tbl where col1 like ?")

    def test_sql_multiline(self):
        self.assertEqual(fingerprint("SELECT col1, created_at FROM tbl\nWHERE col1 like 'abc%'"),
                         "select col1, created_at from tbl where col1 like ?")

    def test_sql_limit(self):
        self.assertEqual(fingerprint('SELECT * FROM tbl WHERE col1 = "abc" LIMIT 10'),
                         "select * from tbl where col1 = ? limit ?")

    def test_sql_call(self):
        self.assertEqual(fingerprint("CALL MYFUNCTION(123)"),
                         "call myfunction(?)")

    def test_sql_long(self):
        self.assertEqual(fingerprint("SELECT *, sleep(1) from tbl where pk = 1 or pk = 2 or pk = 3 or pk = 4 or pk = 5 or pk = 6 or pk = 7 or pk = 8 or pk = 9 or pk = 10 or pk = 11"),
                         "select *, sleep(?) from tbl where pk = ? or pk = ? or pk = ? or pk = ? or pk = ? or pk = ? or pk = ? or pk = ? or pk = ? or pk = ? or pk = ?")
--------------------------------------------------------------------------------
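
Taken together, src/main.py overrides lambda_function.enrich so that slow-query log events are rewritten before the forwarder ships them: "#" header lines are kept, bookkeeping lines ("SET timestamp=", "use") are dropped, and the SQL text is replaced by its fingerprint. The standalone sketch below is editor-added and not part of the repository; it reproduces that transformation on an invented slow-query message using only the fingerprint module exercised by the tests, and the sample log content plus the expected last line are assumptions based on tests/test_fingerprint.py.

import sys
sys.path.append("src/")

from fingerprint import fingerprint

# An invented slow-query log message of the shape my_enrich_slow_logs() expects
raw_message = "\n".join([
    "# Time: 2020-11-13T12:00:00.000000Z",
    "# Query_time: 12.3  Lock_time: 0.0  Rows_sent: 1  Rows_examined: 500000",
    "SET timestamp=1605268800;",
    'SELECT * FROM tbl WHERE col1 = "abc" LIMIT 10',
])

# Same filtering as my_enrich_slow_logs(): keep "#" headers, drop SET/use lines,
# then fingerprint whatever SQL remains
messages, sql = [], ""
for line in raw_message.split("\n"):
    if line.startswith("#"):
        messages.append(line)
    elif line.startswith("SET timestamp=") or line.startswith("use "):
        continue
    else:
        sql += line + "\n"
messages.append(fingerprint(sql))

print("\n".join(messages))
# The last printed line should read (per tests/test_fingerprint.py):
#   select * from tbl where col1 = ? limit ?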