├── .github
    └── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
├── .gitignore
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── ROADMAP.MD
├── alarm_context_tool
    ├── LICENSE
    ├── api_gateway_handler.py
    ├── application_elb_handler.py
    ├── dynamodb_handler.py
    ├── ec2_handler.py
    ├── ecs_handler.py
    ├── eks_handler.py
    ├── functions.py
    ├── functions_alarm.py
    ├── functions_bedrock.py
    ├── functions_cloudformation.py
    ├── functions_email.py
    ├── functions_health.py
    ├── functions_logs.py
    ├── functions_metrics.py
    ├── functions_xray.py
    ├── health_client.py
    ├── lambda_function.py
    ├── lambda_handler.py
    ├── rds_handler.py
    ├── region_lookup.py
    ├── s3_handler.py
    ├── sns_handler.py
    ├── ssm_run_command_handler.py
    └── synthetics_handler.py
├── dependencies_layer
    └── requirements.txt
├── samconfig.toml
└── template.yaml


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode
  3 | 
  4 | ### Linux ###
  5 | *~
  6 | 
  7 | # temporary files which can be created if a process still has a handle open of a deleted file
  8 | .fuse_hidden*
  9 | 
 10 | # KDE directory preferences
 11 | .directory
 12 | 
 13 | # Linux trash folder which might appear on any partition or disk
 14 | .Trash-*
 15 | 
 16 | # .nfs files are created when an open file is removed but is still being accessed
 17 | .nfs*
 18 | 
 19 | ### OSX ###
 20 | *.DS_Store
 21 | .AppleDouble
 22 | .LSOverride
 23 | 
 24 | # Icon must end with two \r
 25 | Icon
 26 | 
 27 | # Thumbnails
 28 | ._*
 29 | 
 30 | # Files that might appear in the root of a volume
 31 | .DocumentRevisions-V100
 32 | .fseventsd
 33 | .Spotlight-V100
 34 | .TemporaryItems
 35 | .Trashes
 36 | .VolumeIcon.icns
 37 | .com.apple.timemachine.donotpresent
 38 | 
 39 | # Directories potentially created on remote AFP share
 40 | .AppleDB
 41 | .AppleDesktop
 42 | Network Trash Folder
 43 | Temporary Items
 44 | .apdisk
 45 | 
 46 | ### PyCharm ###
 47 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
 48 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 49 | 
 50 | # User-specific stuff:
 51 | .idea/**/workspace.xml
 52 | .idea/**/tasks.xml
 53 | .idea/dictionaries
 54 | 
 55 | # Sensitive or high-churn files:
 56 | .idea/**/dataSources/
 57 | .idea/**/dataSources.ids
 58 | .idea/**/dataSources.xml
 59 | .idea/**/dataSources.local.xml
 60 | .idea/**/sqlDataSources.xml
 61 | .idea/**/dynamic.xml
 62 | .idea/**/uiDesigner.xml
 63 | 
 64 | # Gradle:
 65 | .idea/**/gradle.xml
 66 | .idea/**/libraries
 67 | 
 68 | # CMake
 69 | cmake-build-debug/
 70 | 
 71 | # Mongo Explorer plugin:
 72 | .idea/**/mongoSettings.xml
 73 | 
 74 | ## File-based project format:
 75 | *.iws
 76 | 
 77 | ## Plugin-specific files:
 78 | 
 79 | # IntelliJ
 80 | /out/
 81 | 
 82 | # mpeltonen/sbt-idea plugin
 83 | .idea_modules/
 84 | 
 85 | # JIRA plugin
 86 | atlassian-ide-plugin.xml
 87 | 
 88 | # Cursive Clojure plugin
 89 | .idea/replstate.xml
 90 | 
 91 | # Ruby plugin and RubyMine
 92 | /.rakeTasks
 93 | 
 94 | # Crashlytics plugin (for Android Studio and IntelliJ)
 95 | com_crashlytics_export_strings.xml
 96 | crashlytics.properties
 97 | crashlytics-build.properties
 98 | fabric.properties
 99 | 
100 | ### PyCharm Patch ###
101 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
102 | 
103 | # *.iml
104 | # modules.xml
105 | # .idea/misc.xml
106 | # *.ipr
107 | 
108 | # Sonarlint plugin
109 | .idea/sonarlint
110 | 
111 | ### Python ###
112 | # Byte-compiled / optimized / DLL files
113 | __pycache__/
114 | *.py[cod]
115 | *$py.class
116 | 
117 | # C extensions
118 | *.so
119 | 
120 | # Distribution / packaging
121 | .Python
122 | build/
123 | develop-eggs/
124 | dist/
125 | downloads/
126 | eggs/
127 | .eggs/
128 | lib/
129 | lib64/
130 | parts/
131 | sdist/
132 | var/
133 | wheels/
134 | *.egg-info/
135 | .installed.cfg
136 | *.egg
137 | 
138 | # PyInstaller
139 | #  Usually these files are written by a python script from a template
140 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
141 | *.manifest
142 | *.spec
143 | 
144 | # Installer logs
145 | pip-log.txt
146 | pip-delete-this-directory.txt
147 | 
148 | # Unit test / coverage reports
149 | htmlcov/
150 | .tox/
151 | .coverage
152 | .coverage.*
153 | .cache
154 | .pytest_cache/
155 | nosetests.xml
156 | coverage.xml
157 | *.cover
158 | .hypothesis/
159 | 
160 | # Translations
161 | *.mo
162 | *.pot
163 | 
164 | # Flask stuff:
165 | instance/
166 | .webassets-cache
167 | 
168 | # Scrapy stuff:
169 | .scrapy
170 | 
171 | # Sphinx documentation
172 | docs/_build/
173 | 
174 | # PyBuilder
175 | target/
176 | 
177 | # Jupyter Notebook
178 | .ipynb_checkpoints
179 | 
180 | # pyenv
181 | .python-version
182 | 
183 | # celery beat schedule file
184 | celerybeat-schedule.*
185 | 
186 | # SageMath parsed files
187 | *.sage.py
188 | 
189 | # Environments
190 | .env
191 | .venv
192 | env/
193 | venv/
194 | ENV/
195 | env.bak/
196 | venv.bak/
197 | 
198 | # Spyder project settings
199 | .spyderproject
200 | .spyproject
201 | 
202 | # Rope project settings
203 | .ropeproject
204 | 
205 | # mkdocs documentation
206 | /site
207 | 
208 | # mypy
209 | .mypy_cache/
210 | 
211 | ### VisualStudioCode ###
212 | .vscode/*
213 | !.vscode/settings.json
214 | !.vscode/tasks.json
215 | !.vscode/launch.json
216 | !.vscode/extensions.json
217 | .history
218 | .VSCodeCounter/*
219 | 
220 | ### Windows ###
221 | # Windows thumbnail cache files
222 | Thumbs.db
223 | ehthumbs.db
224 | ehthumbs_vista.db
225 | 
226 | # Folder config file
227 | Desktop.ini
228 | 
229 | # Recycle Bin used on file shares
230 | $RECYCLE.BIN/
231 | 
232 | # Windows Installer files
233 | *.cab
234 | *.msi
235 | *.msm
236 | *.msp
237 | 
238 | # Windows shortcuts
239 | *.lnk
240 | 
241 | # End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode
242 | 
243 | # Build folder
244 | 
245 | */build/*
246 | 
247 | # AWS SAM
248 | .aws-sam/*
249 | 
250 | # GitHub
251 | .github/*


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/alarm-context-tool/5292f607b70453ac1aa48f15bdf45d3fa48ec95e/CHANGELOG.md


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT No Attribution
 2 | 
 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so.
10 | 
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17 | 
18 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Alarm Context Tool (ACT)
  2 | 
  3 | The Alarm Context Tool (ACT) enhances AWS CloudWatch Alarms by providing additional context to aid in troubleshooting and analysis. By leveraging AWS services such as Lambda, CloudWatch, X-Ray, and Amazon Bedrock, this solution aggregates and analyzes metrics, logs, and traces to generate meaningful insights. Using generative AI capabilities from Amazon Bedrock, it summarizes findings, identifies potential root causes, and offers relevant documentation links to help operators resolve issues more efficiently. The implementation is designed for easy deployment and integration into existing observability pipelines, significantly reducing response times and improving root cause analysis.
  4 | 
  5 | ## Table of Contents
  6 | - [Dependencies](#prerequisites)
  7 | - [Prerequisites](#dependencies)
  8 | - [Setup](#setup)
  9 | - [Deployment](#deployment)
 10 | - [Usage](#usage)
 11 | - [Creating a New Handler](#creating-a-new-handler)
 12 | - [Testing](#testing)
 13 | - [Environment Variables](#environment-variables)
 14 | - [Available Functions](#some-of-the-available-functions)
 15 | - [Security](#security)
 16 | - [License](#license)
 17 | 
 18 | ## Prerequisites
 19 | 1. [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) configured with appropriate permissions.
 20 | 2. [Python 3.12](https://www.python.org/downloads/) or later if you plan to use your IDE to detect problems in the code.
 21 | 3. [AWS SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-sam-cli.html) for deployment.
 22 | 4. [Access to Anthropic Bedrock foundation models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html)
 23 | 	- Supports Anthropic Claude Models:
 24 | 		- Anthropic Claude Instant v1.2
 25 | 		- Anthropic Claude 2 v2
 26 | 		- Anthropic Claude 2 v2.1
 27 | 		- Anthropic Claude 3 Sonnet
 28 | 		- Anthropic Claude 3 Haiku
 29 | 		- Anthropic Claude 3 Opus
 30 | 5. [Verified identity in Amazon SES](https://docs.aws.amazon.com/ses/latest/dg/verify-addresses-and-domains.html)
 31 | 
 32 | ## Dependencies
 33 | - [markdown](https://pypi.org/project/Markdown/)
 34 | - [boto3](https://pypi.org/project/boto3/)
 35 | - [pandas](https://pypi.org/project/pandas/)
 36 | - [dnspython](https://pypi.org/project/dnspython/)
 37 | - [PyYAML](https://pypi.org/project/PyYAML/)
 38 | - [cfn_flip](https://pypi.org/project/cfn-flip/)
 39 | 
 40 | ## Setup
 41 | 1. Clone the repository:
 42 |     ```sh
 43 |     git clone https://github.com/aws-samples/alarm-context-tool
 44 |     cd alarm-context-tool
 45 |     ```
 46 | 
 47 | 1. Install dependencies if you plan to use your IDE to detect problems in the code:
 48 |     ```sh
 49 |     pip install -r ./dependencies_layer/requirements.txt
 50 |     pip install aws_lambda_powertools 
 51 |     ```
 52 | 
 53 | 1. For some regions, you may need to change the layer version for Lambda Insights after the colon in template.yaml. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Lambda-Insights-extension-versionsx86-64.html.
 54 |     ```yaml
 55 |     - !Sub arn:aws:lambda:${AWS::Region}:580247275435:layer:LambdaInsightsExtension:49
 56 |     ```
 57 | 
 58 | 1. Edit the template.yaml file with the recipient email address and sender address.
 59 | 
 60 |   ```yaml
 61 |   Resources:
 62 |     AlarmContextFunction:
 63 |       Type: AWS::Serverless::Function
 64 |         Handler: lambda_function.alarm_handler
 65 |         Runtime: python3.12
 66 |         Environment:
 67 |           Variables:
 68 |             RECIPIENT: alias@domain.com
 69 |             SENDER: Name <alias@domain.com>
 70 |   ```
 71 | 
 72 | 1. Update additional [Environment Variables](#environment-variables) if required
 73 | 
 74 | 1. Update your SNS Topics that receive notifications from CloudWatch alarms:
 75 | 	- Protocol: AWS Lambda
 76 | 	- Endpoint: ARN of your Lambda function
 77 | 
 78 | ## Deployment
 79 | 1. Use a guided deployment to start with:
 80 |     ```sh
 81 |     sam build
 82 |     sam deploy --guided
 83 |     ```
 84 | 
 85 | 2. Subsequently, you can build, deploy and test using the following command:
 86 |     The test-event must be shared. See Testing
 87 |     ```sh
 88 |     sam build; sam deploy --no-confirm-changeset; sam remote invoke --stack-name alarm-context-tool --region <aws-region> --test-event-name <test-event>
 89 |     ```
 90 | 
 91 | ## Usage
 92 | Once deployed, the Lambda function will be triggered by SNS topics subscribed to CloudWatch Alarms. The function will enhance the alarm message with additional context such as related metrics, logs, and traces. It uses Amazon Bedrock to analyze the gathered data and generate actionable insights.
 93 | 
 94 | ## Creating a New Handler
 95 | To create a new handler for a different AWS service, follow these steps:
 96 | 
 97 | 1. **Create a new handler file**:
 98 |     Create a new Python file in the `handlers` directory. For example, `new_service_handler.py`.
 99 | 
100 | 1. **Define the handler function**:
101 |     Implement the handler function similar to existing handlers. Here's a template:
102 | 
103 |     ```python
104 |     import boto3
105 |     import botocore
106 |     from aws_lambda_powertools import Logger, Tracer
107 | 
108 |     logger = Logger()
109 |     tracer = Tracer()
110 | 
111 |     @tracer.capture_method
112 |     def process_new_service(dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
113 |         # Your implementation here
114 |         pass
115 |     ```
116 | 
117 | 1. **Add the handler to the Lambda function**:
118 |     Update `lambda_function.py` to import and call your new handler based on the trigger.
119 | 
120 | 1. **Update the template**:
121 |     Modify `template.yaml` to include your new handler and update necessary permissions.
122 | 
123 |     ```yaml
124 |     Resources:
125 |       AlarmContextFunction:
126 |         Type: AWS::Serverless::Function
127 |           Handler: lambda_function.alarm_handler
128 |           Runtime: python3.12
129 |           Policies:
130 |             - Statement:
131 |                 - Effect: Allow
132 |                   Action:
133 |                     - new-service:Describe*
134 |                   Resource: "*"
135 |     ```
136 | 
137 | 1. **Add necessary permissions**:
138 |     Ensure that your new handler has the required permissions by updating the `template.yaml` file as shown above.
139 | 
140 | ## Testing
141 | 
142 | 1. **Trigger an Alarm**:
143 |   Manually trigger an alarm using the following command, replacing <alarm_name> with the name of your alarm:
144 |     ```sh
145 |     aws cloudwatch set-alarm-state --state-value ALARM --state-reason "Testing" --alarm-name "<alarm_name>"
146 |     ```
147 | 
148 | 1. **Use the test cases generated in the logs**:
149 | The main Lambda function generates a test case that can be used in the [Lambda console](https://console.aws.amazon.com/lambda/). See [Testing Lambda functions in the console](https://docs.aws.amazon.com/lambda/latest/dg/testing-functions.html?icmpid=docs_lambda_help) or by using ```sam remote invoke```.
150 |   1. Open the [CloudWatch console](https://console.aws.amazon.com/cloudwatch/)
151 |   1. In the navigation pane, choose **Logs**, and then choose **Logs Insights**.
152 |   1. In the Select log group(s) drop down, choose **/aws/lambda/alarm-context-tool-AlarmContextFunction-xxxxxxxxxxxx**
153 |   1. Enter the following query, replacing <alarm_name> with the name of your alarm:
154 |       ```
155 |       fields @timestamp, @message, @logStream, @log
156 |       | filter message  = "test_case" AND Records.0.Sns.Message like /<alarm_name>/
157 |       ```
158 |   1. Choose **Run query**
159 |   1. Expand a log entry and copy the entire **@message** field.
160 |   1. You can then use this to test your Lambda function on demand.
161 | 
162 | ## Environment Variables
163 | The following environment variables can be configured for the Lambda function:
164 | 
165 | - `AWS_LAMBDA_LOG_LEVEL`: Sets the log level for AWS Lambda logs (e.g., INFO, DEBUG). Default is `INFO`.
166 | - `ANTHROPIC_VERSION`: Specifies the version of the Anthropic model to be used. Default is `bedrock-2023-05-31`.
167 | - `BEDROCK_MODEL_ID`: The ID of the Amazon Bedrock model to use. Default is `anthropic.claude-3-sonnet-20240229-v1:0`.
168 | - `BEDROCK_REGION`: The AWS region where the Bedrock model is deployed. Default is `us-east-1`.
169 | - `BEDROCK_MAX_TOKENS`: The maximum number of tokens to be used by the Bedrock model. Default is `4000`.
170 | - `METRIC_ROUNDING_PRECISION_FOR_BEDROCK`: The precision for rounding metrics before sending to Bedrock. Default is `3`.
171 | - `POWERTOOLS_LOG_LEVEL`: Sets the log level for AWS Lambda Powertools logs (e.g., INFO, DEBUG). Default is `INFO`.
172 | - `POWERTOOLS_LOGGER_LOG_EVENT`: Enables logging of the full event in Lambda Powertools logs. Default is `True`.
173 | - `POWERTOOLS_SERVICE_NAME`: The name of the service to be used in Lambda Powertools. Default is `Alarm`.
174 | - `POWERTOOLS_TRACER_CAPTURE_RESPONSE`: Controls whether to capture the response in tracing. Default is `False`.
175 | - `RECIPIENT`: The email address to receive notifications. 
176 | - `SENDER`: The sender's email address for notifications. 
177 | - `USE_BEDROCK`: Enables or disables the use of Amazon Bedrock for generative AI. Default is `True`.
178 | 
179 | 
180 | To configure these variables, update the `template.yaml` file:
181 | 
182 | ```yaml
183 | Resources:
184 |   AlarmContextFunction:
185 |     Type: AWS::Serverless::Function
186 |       Handler: lambda_function.alarm_handler
187 |       Runtime: python3.12
188 |       Environment:
189 |         Variables:
190 |           AWS_LAMBDA_LOG_LEVEL: INFO
191 |           ANTHROPIC_VERSION: bedrock-2023-05-31
192 |           BEDROCK_MODEL_ID: anthropic.claude-3-sonnet-20240229-v1:0
193 |           BEDROCK_REGION: us-east-1
194 |           BEDROCK_MAX_TOKENS: 4000
195 |           METRIC_ROUNDING_PRECISION_FOR_BEDROCK: 3
196 |           POWERTOOLS_LOG_LEVEL: INFO
197 |           POWERTOOLS_LOGGER_LOG_EVENT: "True"
198 |           POWERTOOLS_SERVICE_NAME: Alarm
199 |           POWERTOOLS_TRACER_CAPTURE_RESPONSE: "False"
200 |           RECIPIENT: alias@domain.com
201 |           SENDER: Name <alias@domain.com>
202 |           USE_BEDROCK: "True"   
203 | ```
204 | ## Some of the available functions
205 | 
206 | ### Logs Functions (`functions_logs`)
207 | 
208 | - **get_log_insights_link(log_group_name, start_time, end_time, query)**
209 |   - Generates a CloudWatch Logs Insights query link.
210 |   - **Parameters:**
211 |     - `log_group_name` (str): The name of the log group.
212 |     - `start_time` (str): The start time for the query.
213 |     - `end_time` (str): The end time for the query.
214 |     - `query` (str): The Logs Insights query.
215 | 
216 | ### Metrics Functions (`functions_metrics`)
217 | 
218 | - **build_dashboard(dashboard_metrics, annotation_time, start, end, region)**
219 |   - Builds a dashboard with the specified metrics.
220 |   - **Parameters:**
221 |     - `dashboard_metrics` (list): The list of metrics for the dashboard.
222 |     - `annotation_time` (str): The annotation time for the dashboard.
223 |     - `start` (str): The start time for the dashboard.
224 |     - `end` (str): The end time for the dashboard.
225 |     - `region` (str): The AWS region.
226 | 
227 | ### X-Ray Functions (`functions_xray`)
228 | 
229 | - **process_traces(trace_ids, start_time, end_time, region)**
230 |   - Processes X-Ray traces and retrieves trace summaries and details.
231 |   - **Parameters:**
232 |     - `trace_ids` (list): The list of trace IDs to process.
233 |     - `start_time` (str): The start time for the trace processing.
234 |     - `end_time` (str): The end time for the trace processing.
235 |     - `region` (str): The AWS region.
236 | 
237 | ## Security
238 | 
239 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
240 | 
241 | ## License
242 | 
243 | This library is licensed under the MIT-0 License. See the LICENSE file.
244 | 
245 | 


--------------------------------------------------------------------------------
/ROADMAP.MD:
--------------------------------------------------------------------------------
1 | ## Roadmap
2 | - Add support for alarms triggered by Metric Insights Queries
3 |     - Metric Insights Queries require a function to convert the query to dimensions and metric names
4 | - Add bespoke log insights queries to relevant handlers now that log insights queries are supported
5 | - Add agent support for Amazon Bedrock to improve results


--------------------------------------------------------------------------------
/alarm_context_tool/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this
 4 | software and associated documentation files (the "Software"), to deal in the Software
 5 | without restriction, including without limitation the rights to use, copy, modify,
 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
 7 | permit persons to whom the Software is furnished to do so.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/alarm_context_tool/api_gateway_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | 
  4 | from functions import get_dashboard_button
  5 | from functions import get_html_table
  6 | from functions_logs import get_last_10_events
  7 | from functions_xray import process_traces
  8 | from functions_logs import get_log_insights_link
  9 | from functions_metrics import build_dashboard
 10 | from functions_metrics import get_metrics_from_dashboard_metrics 
 11 | 
 12 | from aws_lambda_powertools import Logger
 13 | from aws_lambda_powertools import Tracer
 14 | logger = Logger()
 15 | tracer = Tracer()
 16 | 
 17 | @tracer.capture_method
 18 | def process_api_gateway(dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
 19 | 
 20 |     # Dimensions: https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-cloudwatch-metrics.html#load-balancer-metric-dimensions-alb
 21 |     
 22 |     if dimensions:
 23 |         dimension_values = {element['name']: element['value'] for element in dimensions}
 24 |         api_name = dimension_values.get('ApiName')
 25 |         api_stage = dimension_values.get('Stage')
 26 |         resource = dimension_values.get('Resource')
 27 |         method = dimension_values.get('Method')
 28 | 
 29 |         link = f'https://{region}.console.aws.amazon.com/apigateway/home?region={region}#/apis/{api_name}/stages/{api_stage}'
 30 |         contextual_links = get_dashboard_button(f"{api_name} stage: {api_stage} details", link)
 31 |             
 32 |         if api_name and api_stage:
 33 |             dashboard_metrics = [
 34 |                 {
 35 |                     "title": "Integration Latency",
 36 |                     "view": "timeSeries",
 37 |                     "stacked": False,
 38 |                     "stat": "Average",
 39 |                     "period": 60,
 40 |                     "metrics": [
 41 |                         [namespace, "IntegrationLatency", "ApiName", api_name, "Stage", api_stage]
 42 |                     ]
 43 |                 },
 44 |                 {
 45 |                     "title": "Latency",
 46 |                     "view": "timeSeries",
 47 |                     "stacked": False,
 48 |                     "stat": "Average",
 49 |                     "period": 60,
 50 |                     "metrics": [
 51 |                         [namespace, "Latency", "ApiName", api_name, "Stage", api_stage]
 52 |                     ]
 53 |                 },
 54 |                 {
 55 |                     "title": "5xx Errors",
 56 |                     "view": "timeSeries",
 57 |                     "stacked": False,
 58 |                     "stat": "Sum",
 59 |                     "period": 60,
 60 |                     "metrics": [
 61 |                         [namespace, "5XXError", "ApiName", api_name, "Stage", api_stage]
 62 |                     ]
 63 |                 },
 64 |                 {
 65 |                     "title": "Request Count",
 66 |                     "view": "timeSeries",
 67 |                     "stacked": False,
 68 |                     "stat": "SampleCount",
 69 |                     "period": 60,
 70 |                     "metrics": [
 71 |                         [namespace, "Count", "ApiName", api_name, "Stage", api_stage]
 72 |                     ]
 73 |                 },
 74 |                 {
 75 |                     "title": "4xx Errors",
 76 |                     "view": "timeSeries",
 77 |                     "stacked": False,
 78 |                     "stat": "Sum",
 79 |                     "period": 60,
 80 |                     "metrics": [
 81 |                         [namespace, "4XXError", "ApiName", api_name, "Stage", api_stage]
 82 |                     ]
 83 |                 }
 84 |             ]
 85 | 
 86 |         elif api_name:
 87 |             dashboard_metrics = [
 88 |                 {
 89 |                     "title": "Integration Latency",
 90 |                     "view": "timeSeries",
 91 |                     "stacked": False,
 92 |                     "stat": "Average",
 93 |                     "period": 60,
 94 |                     "metrics": [
 95 |                         [namespace, "IntegrationLatency", "ApiName", api_name]
 96 |                     ]
 97 |                 },
 98 |                 {
 99 |                     "title": "Latency",
100 |                     "view": "timeSeries",
101 |                     "stacked": False,
102 |                     "stat": "Average",
103 |                     "period": 60,
104 |                     "metrics": [
105 |                         [namespace, "Latency", "ApiName", api_name]
106 |                     ]
107 |                 },
108 |                 {
109 |                     "title": "5xx Errors",
110 |                     "view": "timeSeries",
111 |                     "stacked": False,
112 |                     "stat": "Sum",
113 |                     "period": 60,
114 |                     "metrics": [
115 |                         [namespace, "5XXError", "ApiName", api_name]
116 |                     ]
117 |                 },
118 |                 {
119 |                     "title": "Count",
120 |                     "view": "timeSeries",
121 |                     "stacked": False,
122 |                     "stat": "Sum",
123 |                     "period": 60,
124 |                     "metrics": [
125 |                         [namespace, "Count", "ApiName", api_name]
126 |                     ]
127 |                 },
128 |                 {
129 |                     "title": "4xx Errors",
130 |                     "view": "timeSeries",
131 |                     "stacked": False,
132 |                     "stat": "Sum",
133 |                     "period": 60,
134 |                     "metrics": [
135 |                         [namespace, "4XXError", "ApiName", api_name]
136 |                     ]
137 |                 }
138 |             ]
139 | 
140 |         elif resource and method:
141 |             dashboard_metrics = [
142 |                 { 
143 |                     "title": "Integration Latency",
144 |                     "view": "timeSeries",
145 |                     "stacked": False,
146 |                     "stat": "Average",
147 |                     "period": 60,
148 |                     "metrics": [
149 |                         [namespace, "IntegrationLatency", "ApiName", api_name, "Resource", resource, "Stage", api_stage, "Method", method]
150 |                     ]
151 |                 },
152 |                 {
153 |                     "title": "Latency",
154 |                     "view": "timeSeries",
155 |                     "stacked": False,
156 |                     "stat": "Average",
157 |                     "period": 60,
158 |                     "metrics": [
159 |                         [namespace, "Latency", "ApiName", api_name, "Resource", resource, "Stage", api_stage, "Method", method]
160 |                     ]
161 |                 },
162 |                 {
163 |                     "title": "5xx Errors",
164 |                     "view": "timeSeries",
165 |                     "stacked": False,
166 |                     "stat": "Sum",
167 |                     "period": 60,
168 |                     "metrics": [
169 |                         [namespace, "5XXError", "ApiName", api_name, "Resource", resource, "Stage", api_stage, "Method", method]
170 |                     ]
171 |                 },
172 |                 {
173 |                     "title": "Request Count",
174 |                     "view": "timeSeries",
175 |                     "stacked": False,
176 |                     "stat": "SampleCount",
177 |                     "period": 60,
178 |                     "metrics": [
179 |                         [namespace, "Count", "ApiName", api_name, "Resource", resource, "Stage", api_stage, "Method", method]
180 |                     ]
181 |                 },
182 |                 {
183 |                     "title": "4xx Errors",
184 |                     "view": "timeSeries",
185 |                     "stacked": False,
186 |                     "stat": "Sum",
187 |                     "period": 60,
188 |                     "metrics": [
189 |                         [namespace, "4XXError", "ApiName", api_name, "Resource", resource, "Stage", api_stage, "Method", method]
190 |                     ]
191 |                 }
192 |             ]
193 | 
194 |         widget_images = build_dashboard(dashboard_metrics, annotation_time, start, end, region) 
195 |         additional_metrics_with_timestamps_removed = get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region)
196 | 
197 |         api_gateway = boto3.client('apigateway', region_name=region)
198 | 
199 |         if api_name:
200 |             try:                
201 |                 paginator = api_gateway.get_paginator('get_rest_apis')
202 |                 apis_list = []
203 |                 for page in paginator.paginate():
204 |                     apis_list.extend(page['items'])
205 |                 response = {'items': apis_list}
206 |             except botocore.exceptions.ClientError as error:
207 |                 logger.exception("Error getting rest apis")
208 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
209 |             except botocore.exceptions.ParamValidationError as error:
210 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))
211 |             
212 |             api_id = next((api['id'] for api in response['items'] if api['name'] == api_name), None)
213 |                                 
214 |         if api_id:
215 |             try:
216 |                 response = api_gateway.get_rest_api(restApiId=api_id)
217 |             except botocore.exceptions.ClientError as error:
218 |                 logger.exception("Error getting rest api")
219 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
220 |             except botocore.exceptions.ParamValidationError as error:
221 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))
222 |             tags = response.get('tags', {})
223 |             resource_information = get_html_table("API Gateway: " + api_name, response) 
224 |             resource_information_object = response  
225 | 
226 |             if api_stage:
227 |                 try:
228 |                     response = api_gateway.get_stage(restApiId=api_id, stageName=api_stage)
229 |                 except botocore.exceptions.ClientError as error:
230 |                     logger.exception("Error getting stage")
231 |                     raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
232 |                 except botocore.exceptions.ParamValidationError as error:
233 |                     raise ValueError('The parameters you provided are incorrect: {}'.format(error))
234 |                 tags = response.get('tags', {})
235 |                 resource_information = get_html_table("API Gateway: " + api_name, response)
236 |                 resource_information_object = response  
237 | 
238 | 
239 |                 destination_arn = None
240 |                 if "accessLogSettings" in response and "destinationArn" in response["accessLogSettings"]:
241 |                     destination_arn = response["accessLogSettings"]["destinationArn"]
242 |                     log_group_name = destination_arn.split(":log-group:")[1]     
243 | 
244 |                     # Get the last 10 log events
245 |                     log_input = {"logGroupName": log_group_name}
246 |                     log_information, log_events =  get_last_10_events(log_input, change_time, region)                                              
247 |                          
248 | 
249 |                     # Log Insights Link
250 |                     log_insights_query = f"""fields @timestamp, @message
251 |                         | sort @timestamp desc
252 |                         | limit 200"""
253 |                     log_insights_link = get_log_insights_link(log_input, log_insights_query, region, start_time, end_time)
254 |                     contextual_links += get_dashboard_button("Log Insights" , log_insights_link)                   
255 |   
256 |                 # Get Trace information            
257 |                 filter_expression = f'!OK and service(id(name: "{api_name}/{api_stage}", type: "AWS::ApiGateway::Stage")) AND service(id(account.id: "{account_id}"))'
258 |                 logger.info("X-Ray Filter Expression", filter_expression=filter_expression)
259 |                 trace_summary, trace = process_traces(filter_expression, region, start_time, end_time)
260 | 
261 |     else:
262 |         contextual_links = None
263 |         log_information = None
264 |         log_events = None
265 |         resource_information = None
266 |         resource_information_object = None
267 |         widget_images = None
268 |         additional_metrics_with_timestamps_removed = None
269 |         trace_summary = None
270 |         trace = None
271 |         notifications = None
272 |         tags = None
273 |     
274 |     return {
275 |         "contextual_links": contextual_links,
276 |         "log_information": log_information,
277 |         "log_events": log_events,
278 |         "resource_information": resource_information,
279 |         "resource_information_object": resource_information_object,
280 |         "notifications": None,
281 |         "widget_images": widget_images,
282 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
283 |         "trace_summary": trace_summary,
284 |         "trace": trace,
285 |         "tags": tags
286 |     }   


--------------------------------------------------------------------------------
/alarm_context_tool/dynamodb_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | 
  4 | from functions import get_dashboard_button
  5 | from functions import get_html_table
  6 | from functions import get_html_table_with_fields
  7 | from functions_metrics import build_dashboard
  8 | from functions_metrics import get_metrics_from_dashboard_metrics 
  9 | from functions_xray import process_traces
 10 | 
 11 | from aws_lambda_powertools import Logger
 12 | from aws_lambda_powertools import Tracer
 13 | logger = Logger()
 14 | tracer = Tracer()
 15 | 
 16 | @tracer.capture_method
 17 | def process_dynamodb(dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
 18 |     for elements in dimensions:
 19 |         if elements['name'] == 'TableName':
 20 |             id = elements['value']
 21 |             link = 'https://%s.console.aws.amazon.com/dynamodbv2/home?region=%s#table?name=%s&tab=monitoring' % (region, region, str(id))   
 22 |             contextual_links = get_dashboard_button("%s table Monitoring" % (str(id)), link) 
 23 |             link = 'https://%s.console.aws.amazon.com/dynamodbv2/home?region=%s#table?name=%s' % (region, region, str(id))   
 24 |             contextual_links += get_dashboard_button("%s details" % (str(id)), link) 
 25 |             link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#home:dashboards/DynamoDB?~(alarmStateFilter~(~\'ALARM))' % (region, region)   
 26 |             contextual_links += get_dashboard_button("DynamoDB in ALARM dashboard" , link)
 27 |             
 28 |             dashboard_metrics = [    
 29 |                 {
 30 |                     "title": "Read usage (average units/second)",
 31 |                     "view": "timeSeries",
 32 |                     "stacked": False,
 33 |                     "stat": "Average",
 34 |                     "period": 60,
 35 |                     "metrics": [
 36 |                         [ "AWS/DynamoDB", "ProvisionedReadCapacityUnits", "TableName", id, { "label": "Provisioned", "color": "#E02020", "region": region } ],
 37 |                         [ "AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", id, { "stat": "Sum", "id": "m1", "visible": False, "region": region } ],
 38 |                         [ { "expression": "m1/PERIOD(m1)", "label": "Consumed", "id": "e1", "color": "#0073BB", "region": region } ]
 39 |                     ]
 40 |                 },
 41 |                 {
 42 |                     "title": "Write usage (average units/second)",
 43 |                     "view": "timeSeries",
 44 |                     "stacked": False,
 45 |                     "stat": "Average",
 46 |                     "period": 60,
 47 |                     "metrics": [
 48 |                         [ "AWS/DynamoDB", "ProvisionedWriteCapacityUnits", "TableName", id, { "label": "Provisioned", "color": "#E02020", "region": region } ],
 49 |                         [ "AWS/DynamoDB", "ConsumedWriteCapacityUnits", "TableName", id, { "stat": "Sum", "id": "m1", "visible": False, "region": region } ],
 50 |                         [ { "expression": "m1/PERIOD(m1)", "label": "Consumed", "id": "e1", "color": "#0073BB", "region": region } ]
 51 |                     ]
 52 |                 },
 53 |                 {
 54 |                     "title": "Read throttled requests (count)",
 55 |                     "view": "timeSeries",
 56 |                     "stacked": False,
 57 |                     "stat": "Sum",
 58 |                     "period": 60,
 59 |                         "metrics": [
 60 |                             [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "GetItem", { "color": "#0073BB", "region": region } ],
 61 |                             [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "Scan", { "color": "#FF7F0F", "region": region } ],
 62 |                             [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "Query", { "color": "#2DA02D", "region": region } ],
 63 |                             [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "BatchGetItem", { "color": "#9468BD", "region": region } ]
 64 |                         ]
 65 |                 },
 66 |                 {
 67 |                     "title": "Read throttled events (count)",
 68 |                     "view": "timeSeries",
 69 |                     "stacked": False,
 70 |                     "stat": "Sum",
 71 |                     "period": 60,
 72 |                         "metrics": [
 73 |                             [ "AWS/DynamoDB", "ReadThrottleEvents", "TableName", id, { "label": "Provisioned", "region": region } ]
 74 |                         ]   
 75 |                 },
 76 |                 {
 77 |                     "title": "Write throttled requests (count)",
 78 |                     "view": "timeSeries",
 79 |                     "stacked": False,
 80 |                     "stat": "Sum",
 81 |                     "period": 60,
 82 |                     "metrics": [
 83 |                         [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "PutItem", { "color": "#0073BB", "region": region } ],
 84 |                         [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "UpdateItem", { "color": "#FF7F0F", "region": region } ],
 85 |                         [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "DeleteItem", { "color": "#2DA02D", "region": region } ],
 86 |                         [ "AWS/DynamoDB", "ThrottledRequests", "TableName", id, "Operation", "BatchWriteItem", { "color": "#9468BD", "region": region } ]
 87 |                     ]
 88 |                 },
 89 |                 {
 90 |                     "title": "Write throttled events (count)",
 91 |                     "view": "timeSeries",
 92 |                     "stacked": False,
 93 |                     "stat": "Sum",
 94 |                     "period": 60,
 95 |                     "metrics": [
 96 |                         [ "AWS/DynamoDB", "WriteThrottleEvents", "TableName", id, { "label": "Provisioned", "region": region } ]
 97 |                     ]
 98 |                 }                    
 99 |             ]
100 |             widget_images = build_dashboard(dashboard_metrics, annotation_time, start, end, region) 
101 |             additional_metrics_with_timestamps_removed = get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region)
102 |             
103 |             # Describe table
104 |             ddb = boto3.client('dynamodb', region_name=region)
105 |             try:
106 |                 response = ddb.describe_table(TableName=id) 
107 |             except botocore.exceptions.ClientError as error:
108 |                 logger.exception("Error describing DynamoDB table")
109 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
110 |             except botocore.exceptions.ParamValidationError as error:
111 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))
112 |                             
113 |             resource_information = get_html_table("DynamoDB Table: " +id, response['Table'])  
114 |             resource_information_object = response['Table']
115 | 
116 |             # Get Tags
117 |             try:
118 |                 response = ddb.list_tags_of_resource(ResourceArn=response['Table']['TableArn']) 
119 |             except botocore.exceptions.ClientError as error:
120 |                 logger.exception("Error listing DynamoDB tags")
121 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
122 |             except botocore.exceptions.ParamValidationError as error:
123 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))            
124 |             logger.info("DynamoDB Tags" , extra=response)
125 |             resource_information += get_html_table_with_fields("DynamoDB Table Tags: " +id, response['Tags'])  
126 |             tags = response.get('Tags', None)
127 |             
128 |             # Get Trace information            
129 |             filter_expression = f'!OK and service(id(name: "{id}", type: "AWS::DynamoDB::Table")) AND service(id(account.id: "{account_id}"))'
130 |             logger.info("X-Ray Filter Expression", filter_expression=filter_expression)
131 |             trace_summary, trace = process_traces(filter_expression, region, start_time, end_time)
132 |         else:
133 |             contextual_links = None
134 |             log_information = None
135 |             log_events = None
136 |             resource_information = None
137 |             resource_information_object = None
138 |             widget_images = None
139 |             additional_metrics_with_timestamps_removed = None
140 |             trace_summary = None
141 |             trace = None
142 |             notifications = None
143 |             tags = None
144 |     
145 |     return {
146 |         "contextual_links": contextual_links,
147 |         "log_information": None,
148 |         "log_events": None,
149 |         "resource_information": resource_information,
150 |         "resource_information_object": resource_information_object,
151 |         "notifications": None,
152 |         "widget_images": widget_images,
153 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
154 |         "trace_summary": trace_summary,
155 |         "trace": trace,
156 |         "tags": tags
157 |     }


--------------------------------------------------------------------------------
/alarm_context_tool/ecs_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore 
  3 | 
  4 | from functions import get_dashboard_button
  5 | from functions import get_html_table
  6 | from functions_logs import get_last_10_events
  7 | from functions_logs import get_log_insights_link
  8 | from functions_metrics import build_dashboard
  9 | from functions_metrics import get_metrics_from_dashboard_metrics
 10 | from functions_xray import process_traces
 11 | from functions import get_information_panel
 12 | 
 13 | from aws_lambda_powertools import Logger
 14 | from aws_lambda_powertools import Tracer
 15 | logger = Logger()
 16 | tracer = Tracer()
 17 | 
 18 | @tracer.capture_method
 19 | def process_ecs(dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
 20 |     
 21 |     # Initialize variables
 22 |     contextual_links = ""
 23 |     log_information = ""
 24 |     log_events = ""
 25 |     resource_information = ""
 26 |     resource_information_object = {}
 27 |     widget_images = []
 28 |     additional_metrics_with_timestamps_removed = []
 29 |     trace_summary = None
 30 |     trace = None
 31 |     notifications = ""
 32 | 
 33 |     # Required in case Service appears before Cluster in dimensions
 34 |     for elements in dimensions:
 35 |         if elements['name'] == 'ServiceName':
 36 |             service_name = elements['value']
 37 |         elif elements['name'] == 'ClusterName':  
 38 |             cluster_name = elements['value']    
 39 | 
 40 |     ecs_automatic_dashboard_link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#home:dashboards/ECS?~(alarmStateFilter~(~\'ALARM))' % (region, region)   
 41 |     contextual_links = get_dashboard_button("ECS automatic dashboard" , ecs_automatic_dashboard_link) 
 42 | 
 43 |     for elements in dimensions:
 44 |         if elements['name'] == 'ClusterName':
 45 |             id = elements['value']
 46 |             cluster_name = id
 47 | 
 48 |             # Describe ECS Cluster
 49 |             ecs = boto3.client('ecs', region_name=region)  
 50 |             try:
 51 |                 response = ecs.describe_clusters(clusters=[id],include=['SETTINGS','STATISTICS','TAGS'])
 52 |             except botocore.exceptions.ClientError as error:
 53 |                 logger.exception("Error describing ECS Cluster")
 54 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
 55 |             except botocore.exceptions.ParamValidationError as error:
 56 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))                        
 57 |                           
 58 |             resource_information += get_html_table("ECS Cluster: " +id, response['clusters'][0])
 59 |             resource_information_object.update(response['clusters'][0])
 60 | 
 61 |             # Check if Container Insights is enabled
 62 |             container_insights_enabled = any(
 63 |                 sub_element.get('name') == 'containerInsights' and sub_element.get('value') == 'enabled'
 64 |                 for sub_element in response['clusters'][0]['settings']
 65 |             )
 66 |                         
 67 |                         
 68 |             if container_insights_enabled:
 69 |                 container_insights_link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#container-insights:performance/ECS:Cluster?~(query~(controls~(CW*3a*3aECS.cluster~(~\'%s)))~context~())' % (region, region, str(id))   
 70 |                 container_insights_title = '<b>Container Insights:</b> %s' % (str(id))
 71 |                 contextual_links += get_dashboard_button(container_insights_title , container_insights_link)   
 72 |                 
 73 |                 container_insights_namespace = 'ECS/ContainerInsights'
 74 |                 container_insights_dimensions = 'ClusterName'
 75 |                 
 76 |                 dashboard_metrics = [
 77 |                     {
 78 |                         "title": id + " Container Instance Count",
 79 |                         "view": "singleValue",
 80 |                         "stacked": False,
 81 |                         "stat": "Average",
 82 |                         "period": 60,
 83 |                         "metrics": [
 84 |                             [container_insights_namespace, "ContainerInstanceCount", container_insights_dimensions, id]
 85 |                         ]
 86 |                     },
 87 |                     {
 88 |                         "title": id + " Task Count",
 89 |                         "view": "singleValue",
 90 |                         "stacked": False,
 91 |                         "stat": "Average",
 92 |                         "period": 60,
 93 |                         "metrics": [
 94 |                             [container_insights_namespace, "TaskCount", container_insights_dimensions, id]
 95 |                         ]
 96 |                     },
 97 |                     {
 98 |                         "title": id + " Service Count",
 99 |                         "view": "singleValue",
100 |                         "stacked": False,
101 |                         "stat": "Average",
102 |                         "period": 60,
103 |                         "metrics": [
104 |                             [container_insights_namespace, "ServiceCount", container_insights_dimensions, id]
105 |                         ]
106 |                     },
107 |                     {
108 |                         "title": id + " CPU Utilized",
109 |                         "view": "timeSeries",
110 |                         "stacked": False,
111 |                         "stat": "Average",
112 |                         "period": 60,
113 |                         "yAxis": {
114 |                             "left": {
115 |                                 "min": 0,
116 |                                 "showUnits": False,
117 |                                 "label": "Percent"
118 |                             }
119 |                         },                        
120 |                         "metrics": [
121 |                             [ { "id": "expr1m0", "label": id, "expression": "mm1m0 * 100 / mm0m0", "stat": "Average", "region": region } ],
122 |                             [ container_insights_namespace, "CpuReserved", container_insights_dimensions, id, { "id": "mm0m0", "visible": False, "stat": "Sum", "region": region } ],
123 |                             [ ".", "CpuUtilized", ".", ".", { "id": "mm1m0", "visible": False, "stat": "Sum", "region": region } ]
124 |                         ]
125 |                     },
126 |                     {
127 |                         "title": id + " Memory Utilized",
128 |                         "view": "timeSeries",
129 |                         "stacked": False,
130 |                         "stat": "Average",
131 |                         "period": 60,
132 |                         "yAxis": {
133 |                             "left": {
134 |                                 "min": 0,
135 |                                 "showUnits": False,
136 |                                 "label": "Percent"
137 |                             }
138 |                         },                           
139 |                         "metrics": [
140 |                             [ { "id": "expr1m0", "label": id, "expression": "mm1m0 * 100 / mm0m0", "stat": "Average", "region": region } ],
141 |                             [ container_insights_namespace, "MemoryReserved", container_insights_dimensions, id, { "id": "mm0m0", "visible": False, "stat": "Sum", "region": region } ],
142 |                             [ ".", "MemoryUtilized", ".", ".", { "id": "mm1m0", "visible": False, "stat": "Sum", "region": region } ]
143 |                         ]
144 |                     },
145 |                     {
146 |                         "title": id + " Ephemeral Storage Utilized",
147 |                         "view": "timeSeries",
148 |                         "stacked": False,
149 |                         "stat": "Average",
150 |                         "period": 60,
151 |                         "yAxis": {
152 |                             "left": {
153 |                                 "min": 0,
154 |                                 "showUnits": False,
155 |                                 "label": "Percent"
156 |                             }
157 |                         },                           
158 |                         "metrics": [
159 |                             [ { "id": "expr1m0", "label": id, "expression": "mm1m0 * 100 / mm0m0", "stat": "Average", "region": region } ],
160 |                             [ container_insights_namespace, "EphemeralStorageReserved", container_insights_dimensions, id, { "id": "mm0m0", "visible": False, "stat": "Sum", "region": region } ],
161 |                             [ ".", "EphemeralStorageUtilized", ".", ".", { "id": "mm1m0", "visible": False, "stat": "Sum", "region": region } ]
162 |                         ]
163 |                     },
164 |                     {
165 |                         "title": id + " Network Tx Bytes",
166 |                         "view": "timeSeries",
167 |                         "stacked": False,
168 |                         "stat": "Average",
169 |                         "period": 60,
170 |                         "metrics": [
171 |                             [container_insights_namespace, "NetworkTxBytes", container_insights_dimensions, id]
172 |                         ]
173 |                     },
174 |                     {
175 |                         "title": id + " Network Rx Bytes",
176 |                         "view": "timeSeries",
177 |                         "stacked": False,
178 |                         "stat": "Average",
179 |                         "period": 60,
180 |                         "metrics": [
181 |                             [container_insights_namespace, "NetworkRxBytes", container_insights_dimensions, id]
182 |                         ]
183 |                     },
184 |                     {
185 |                         "title": id + " Container Instance Count",
186 |                         "view": "timeSeries",
187 |                         "stacked": False,
188 |                         "stat": "Average",
189 |                         "period": 60,
190 |                         "metrics": [
191 |                             [container_insights_namespace, "ContainerInstanceCount", container_insights_dimensions, id]
192 |                         ]
193 |                     },    
194 |                     {
195 |                         "title": id + " Task Count",
196 |                         "view": "timeSeries",
197 |                         "stacked": False,
198 |                         "stat": "Average",
199 |                         "period": 60,
200 |                         "metrics": [
201 |                             [container_insights_namespace, "TaskCount", container_insights_dimensions, id]
202 |                         ]
203 |                     },
204 |                     {
205 |                         "title": id + " Service Count",
206 |                         "view": "timeSeries",
207 |                         "stacked": False,
208 |                         "stat": "Average",
209 |                         "period": 60,
210 |                         "metrics": [
211 |                             [container_insights_namespace, "ServiceCount", container_insights_dimensions, id]
212 |                         ]
213 |                     }
214 |                 ]
215 |                 widget_images.extend(build_dashboard(dashboard_metrics, annotation_time, start, end, region))
216 |                 additional_metrics_with_timestamps_removed.extend(get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region))
217 |             else:
218 |                 panel_title = "You do not have Container Insights enabled for this cluster"
219 |                 panel_content =  f''' 
220 |                     Use CloudWatch Container Insights to collect, aggregate, and summarize metrics and logs from your containerized applications and microservices.
221 |                     <a href="https://{region}.console.aws.amazon.com/ecs/v2/account-settings/account-settings-edit?region={region}">Enable Container Insights</a>'
222 |                 '''
223 |                 notifications = get_information_panel(panel_title, panel_content)      
224 |             ecs_link = 'https://%s.console.aws.amazon.com/ecs/v2/clusters/%s/services?region=%s' % (region, str(id), region)   
225 |             ecs_title = '<b>ECS Console:</b> %s' % (str(id))
226 |             contextual_links += get_dashboard_button(ecs_title , ecs_link)                 
227 |       
228 |         elif elements['name'] == 'ServiceName':
229 |             id = elements['value']      
230 |         
231 |             # Describe ECS Service
232 |             ecs = boto3.client('ecs', region_name=region)  
233 |             try:
234 |                 response = ecs.describe_services(cluster=cluster_name,services=[id],include=['TAGS'])
235 |             except botocore.exceptions.ClientError as error:
236 |                 logger.exception("Error describing ECS Service")
237 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
238 |             except botocore.exceptions.ParamValidationError as error:
239 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))                        
240 |                           
241 |             resource_information += get_html_table("ECS Service: " +id, response['services'][0])
242 |             resource_information_object.update(response['services'][0])
243 |        
244 |             
245 |             ecs_service_link = 'https://%s.console.aws.amazon.com/ecs/v2/clusters/%s/services/%s/health?region=%s ' % (region, cluster_name, str(id), region)   
246 |             ecs_service_title = '<b>ECS Console:</b> %s' % (str(id))
247 |             contextual_links += get_dashboard_button(ecs_service_title , ecs_service_link) 
248 |             
249 |             # Describe task definition to get log groups
250 |             try:
251 |                 response = ecs.describe_task_definition(taskDefinition=response['services'][0]['taskDefinition'],include=['TAGS',])
252 |             except botocore.exceptions.ClientError as error:
253 |                 logger.exception("Error describing ECS task definition")
254 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
255 |             except botocore.exceptions.ParamValidationError as error:
256 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))              
257 |             
258 |             log_inputs = []
259 |             for container_definition in response['taskDefinition']['containerDefinitions']:
260 |                 if container_definition['logConfiguration']['logDriver'] == "awslogs":
261 |                     log_input = {"logGroupName": container_definition['logConfiguration']['options']['awslogs-group']}
262 |                     log_inputs.append(log_input)
263 |                     log_information, log_events =  get_last_10_events(log_input, change_time, region) 
264 |                     
265 |             # Log Insights Link
266 |             log_insights_query = """fields @timestamp, @message
267 |                 | sort @timestamp desc
268 |                 | limit 100"""
269 |             log_insights_link = get_log_insights_link(log_inputs, log_insights_query, region, start_time, end_time)
270 |             contextual_links += get_dashboard_button("Log Insights" , log_insights_link)                         
271 | 
272 |             dashboard_metrics = [
273 |                 {
274 |                     "title": id + " CPU Utilization",
275 |                     "view": "timeSeries",
276 |                     "stacked": False,
277 |                     "stat": "Average",
278 |                     "period": 60,
279 |                     "metrics": [
280 |                          [ "AWS/ECS", "CPUUtilization", "ClusterName", cluster_name, "ServiceName", id ]
281 |                     ]
282 |                 },
283 |                 {
284 |                     "title": id + " Memory Utilization",
285 |                     "view": "timeSeries",
286 |                     "stacked": False,
287 |                     "stat": "Average",
288 |                     "period": 60,
289 |                     "metrics": [
290 |                         [ "AWS/ECS", "MemoryUtilization", "ClusterName", cluster_name, "ServiceName", id ]
291 |                     ]
292 |                 }
293 |             ]
294 |             widget_images.extend(build_dashboard(dashboard_metrics, annotation_time, start, end, region))
295 |             additional_metrics_with_timestamps_removed.extend(get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region))
296 | 
297 |             # Get Trace information
298 |             # This will only work if the specified service name for X-Ray is the same as the ECS service name.
299 |             filter_expression = f'!OK and service(id(name: "{id}", type: "AWS::ECS::Container")) AND service(id(account.id: "{account_id}"))'
300 |             logger.info("X-Ray Filter Expression", filter_expression=filter_expression)
301 |             trace_summary, trace = process_traces(filter_expression, region, start_time, end_time)
302 | 
303 | 
304 |         else:
305 |             contextual_links = None
306 |             log_information = None
307 |             log_events = None
308 |             resource_information = None
309 |             resource_information_object = None
310 |             widget_images = None
311 |             additional_metrics_with_timestamps_removed = None
312 |             trace_summary = None
313 |             trace = None
314 |             notifications = None
315 | 
316 |     
317 |     return {
318 |         "contextual_links": contextual_links,
319 |         "log_information": log_information,
320 |         "log_events": log_events,
321 |         "resource_information": resource_information,
322 |         "resource_information_object": resource_information_object,
323 |         "notifications": None,
324 |         "widget_images": widget_images,
325 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
326 |         "trace_summary": None,
327 |         "trace": None
328 |     }


--------------------------------------------------------------------------------
/alarm_context_tool/functions.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import datetime
  3 | import pandas as pd
  4 | 
  5 | from aws_lambda_powertools import Logger
  6 | from aws_lambda_powertools import Tracer
  7 | logger = Logger()
  8 | tracer = Tracer()
  9 | 
 10 | @tracer.capture_method
 11 | def json_serial(obj):
 12 |     """
 13 |     JSON serializer for objects not serializable by default json code
 14 |     """
 15 |     if isinstance(obj, datetime.datetime):
 16 |         return obj.isoformat()
 17 |     raise TypeError("Type not serializable")
 18 | 
 19 | @tracer.capture_method
 20 | def create_test_case(event):
 21 |     # Extract the relevant SNS message part of the event
 22 |     sns_message = event['Records'][0]['Sns']
 23 |     message_dict = sns_message['Message']
 24 |     
 25 |     # Construct the test_case using the extracted data
 26 |     test_case = {
 27 |         "Records": [
 28 |             {
 29 |                 "EventSource": sns_message.get("EventSource", "aws:sns"),
 30 |                 "EventVersion": sns_message.get("EventVersion", "1.0"),
 31 |                 "EventSubscriptionArn": sns_message.get("EventSubscriptionArn", ""),
 32 |                 "Sns": {
 33 |                     "Type": sns_message.get("Type", "Notification"),
 34 |                     "MessageId": sns_message.get("MessageId", ""),
 35 |                     "TopicArn": sns_message.get("TopicArn", ""),
 36 |                     "Subject": sns_message.get("Subject", ""),
 37 |                     "Message": message_dict, 
 38 |                     "Timestamp": sns_message.get("Timestamp", "default_timestamp"),
 39 |                     "SignatureVersion": sns_message.get("SignatureVersion", "1"),
 40 |                     "Signature": sns_message.get("Signature", ""),
 41 |                     "SigningCertUrl": sns_message.get("SigningCertUrl", ""),
 42 |                     "UnsubscribeUrl": sns_message.get("UnsubscribeUrl", ""),
 43 |                     "MessageAttributes": sns_message.get("MessageAttributes", {})
 44 |                 }
 45 |             }
 46 |         ]
 47 |     }    
 48 |     return test_case
 49 | 
 50 | @tracer.capture_method
 51 | def is_json(test_str):
 52 |     """
 53 |     This function checks if a given string is a valid JSON object by trying to parse it. If parsing is successful, it
 54 |     then checks if the string starts with '{' as this is the expected start of a JSON object.
 55 | 
 56 |     Args:
 57 |         test_str (str): A string to check if it is a valid JSON object.
 58 | 
 59 |     Returns:
 60 |         bool: True if the string is a valid JSON object, False otherwise.
 61 |     """    
 62 |     try:
 63 |         json.loads(test_str)
 64 |     except ValueError as e:
 65 |         return False
 66 |     if test_str[:1] == "{":
 67 |         return True
 68 |     else:
 69 |         return False
 70 |         
 71 | @tracer.capture_method
 72 | def get_dashboard_button(button_title, button_link):
 73 |     """
 74 |     Returns an HTML-formatted button element with the given title and link.
 75 | 
 76 |     Parameters:
 77 |     button_title (str): The text to display on the button.
 78 |     button_link (str): The URL to link to when the button is clicked.
 79 | 
 80 |     Returns:
 81 |     str: An HTML-formatted button element.
 82 |     """    
 83 |     dashboard_button  = '<a rel="noopener" target="_blank" href="%s" style="margin-right: 10px; margin-bottom:10px; background-color: #ff9900; background-image: linear-gradient(#ff9900,#ff9900); font-size: 13px; font-family: Helvetica, Arial, sans-serif; font-weight: 700; text-decoration: none; padding: 9px 9px; color: rgb(255, 255, 255) !important; -webkit-text-fill-color: rgb(255, 255, 255) !important; border-radius: 2px; display: inline-block; mso-padding-alt: 0;">' % (button_link)
 84 |     dashboard_button += '   <!--[if mso]>'
 85 |     dashboard_button += '   <i style="letter-spacing: 25px; mso-font-width: -100%; mso-text-raise: 30pt;">&nbsp;</i>'
 86 |     dashboard_button += '   <![endif]-->'
 87 |     dashboard_button += '   <span style="mso-text-raise: 15pt;">%s <img style="margin-bottom: -4px;" src="cid:imageId2"></span>' % (button_title)
 88 |     dashboard_button += '   <!--[if mso]>'
 89 |     dashboard_button += '   <i style="letter-spacing: 25px; mso-font-width: -100%;">&nbsp;</i>'
 90 |     dashboard_button += '   <![endif]-->'
 91 |     dashboard_button += '</a>'
 92 |     return dashboard_button
 93 |     
 94 | @tracer.capture_method
 95 | def get_information_panel(panel_title, panel_content):
 96 |     """
 97 |     Returns an HTML table formatted as an information panel with a title and content.
 98 | 
 99 |     Parameters:
100 |         panel_title (str): The title of the information panel.
101 |         panel_content (str): The content to be displayed in the information panel.
102 | 
103 |     Returns:
104 |         str: An HTML table formatted as an information panel with a title and content.
105 |     """    
106 |     information_panel  = '<table style="border-radius: 2px; margin-bottom:10px;" cellpadding="9" cellspacing="0" width="100%" align="center" border="0">'
107 |     information_panel += '   <tr>'
108 |     information_panel += '      <td style="background-color: #003181; background-image: linear-gradient(#003181,#003181); color: rgb(255, 255, 255) !important; -webkit-text-fill-color: rgb(255, 255, 255) !important" rowspan="2">&#8505;</td>'    
109 |     information_panel += '      <td style="background-color: #2074d5; background-image: linear-gradient(#2074d5,#2074d5); color: rgb(255, 255, 255) !important; -webkit-text-fill-color: rgb(255, 255, 255) !important"><b>%s</b></td>' % (panel_title)
110 |     information_panel += '   </tr>'
111 |     information_panel += '   <tr>'
112 |     information_panel +=        '<td style="background-color: #2074d5; background-image: linear-gradient(#2074d5,#2074d5); color: rgb(255, 255, 255) !important; -webkit-text-fill-color: rgb(255, 255, 255) !important">%s</td>' % (panel_content)
113 |     information_panel += '   </tr>'
114 |     information_panel += '</table>'
115 |     return information_panel    
116 |     
117 | @tracer.capture_method
118 | def get_html_table_with_fields(title, items_list, fields=None):
119 |     """
120 |     Returns an HTML table with the specified title and items_list.
121 | 
122 |     Parameters:
123 |     title (str): Title of the table.
124 |     items_list (list): List of dictionaries containing the data to populate the table.
125 |     fields (list): List of fields to display in the table. If None, all fields from the first item are displayed.
126 | 
127 |     Returns:
128 |     str: HTML table as a string.
129 |     """
130 |     # Determine fields from the first item if not explicitly provided
131 |     if not fields and items_list:
132 |         fields = list(items_list[0].keys())
133 | 
134 |     # Define table header and CSS styles
135 |     html_table = f'<table id="info" width="640" style="word-wrap: anywhere; max-width:640px !important; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="100%" align="center" border="0">'
136 |     html_table += f'<tr><td colspan="{len(fields)}" style="text-align:center;"><b>{title}</b></td></tr>'
137 | 
138 |     # Add table headers
139 |     html_table += '<tr>' + ''.join(f'<th>{field}</th>' for field in fields) + '</tr>'
140 | 
141 |     # Add table rows
142 |     for item in items_list:
143 |         html_table += '<tr>' + ''.join(f'<td>{item.get(field, "")}</td>' for field in fields) + '</tr>'
144 | 
145 |     html_table += '</table>'
146 |     return html_table
147 | 
148 | @tracer.capture_method
149 | def get_html_table(title, items_dict):
150 |     """
151 |     Returns an HTML table with the specified title and items_dict.
152 | 
153 |     Parameters:
154 |     title (str): Title of the table.
155 |     items_dict (dict): Dictionary containing the data to populate the table.
156 | 
157 |     Returns:
158 |     str: HTML table as a string.
159 | 
160 |     """    
161 |     html_table  = '<table id="info" width="640" style="word-wrap: anywhere; max-width:640px !important; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="100%" align="center" border="0">'
162 |     html_table += '<tr><td colspan="3"><center><b>%s</b></center></td></tr>' % (title)
163 |     for key, value in items_dict.items():
164 |         if type(value) == list:
165 |             if len(value) > 0:
166 |                 html_items = ""
167 |                 i = 0
168 |                 for items in value:
169 |                     i += 1
170 |                     if len(items) == 2:
171 |                         if i == 1:
172 |                             html_table += '<tr><td id="0" rowspan="%s"><b>%s</b></td>' % (len(value)+1,key)
173 |                         if type(items) == dict:
174 |                             html_items += '<tr>'
175 |                             for item_key, item_value in items.items():
176 |                                 if type(item_value) == dict:
177 |                                     if i == 1:
178 |                                         html_table += '<td id="01"><b>%s</b></td>' % (item_key)                                    
179 |                                     html_items += '<td id="1" style="word-wrap: break-all;">'
180 |                                     for sub_value_key, sub_value_value in item_value.items():
181 |                                         html_items += "<b>%s</b>: %s<br>" % (sub_value_key, sub_value_value) 
182 |                                     html_items += "</td>"
183 |                                 elif type(item_value) in [str, int, float, datetime.datetime] and item_value:
184 |                                     if i == 1:
185 |                                         html_table += '<td id="2" style="word-wrap: break-all;"><b>%s</b></td>' % (item_key)
186 |                                     if type (item_value) == datetime.datetime:
187 |                                         item_value = item_value.strftime("%a %d %b, %Y %H:%M:%S %Z") 
188 |                                     if type (item_value) == str:
189 |                                         if(is_json(item_value)):
190 |                                             parsed_json = json.loads(item_value)
191 |                                             item_value = json.dumps(parsed_json, indent=2)  
192 |                                             item_value = item_value.replace('\n', '<br>')
193 |                                             item_value = item_value.replace(' ', '&nbsp;')
194 |                                             item_value = '<pre style="overflow-x: auto; white-space: pre-wrap; white-space: -moz-pre-wrap; white-space: -pre-wrap; white-space: -o-pre-wrap;  word-wrap: break-word; ">' +item_value +"</pre>"
195 |                                     html_items += '<td id="3" style="word-wrap: break-all;">%s</td>' % (item_value)                                
196 |                                 else:
197 |                                     html_items += '<tr><td id="4" colspan="2">&nbsp;</td></tr>'
198 |                             html_items += '</tr>'
199 |                         elif type(items) in [str, int, float, datetime.datetime] and items:
200 |                             if type (items) == datetime.datetime:
201 |                                 items = items.strftime("%a %d %b, %Y %H:%M:%S %Z")
202 |                             if type (items) == str:
203 |                                 if(is_json(items)):
204 |                                     parsed_json = json.loads(items)
205 |                                     items = json.dumps(parsed_json, indent=2)  
206 |                                     items = items.replace('\n', '<br>')
207 |                                     items = items.replace(' ', '&nbsp;')  
208 |                                     items = '<pre style="overflow-x: auto; white-space: pre-wrap; white-space: -moz-pre-wrap; white-space: -pre-wrap; white-space: -o-pre-wrap;  word-wrap: break-word;">' +items +"</pre>"
209 |                             html_items += '<tr><td id="5" colspan="2">%s</td></tr>'  % (items) 
210 |                         else:
211 |                             html_items += '<tr><td id="6" colspan="2">&nbsp;</td></tr>'
212 |                 html_items += '</tr>'
213 |                 html_table += '</tr>'
214 |                 html_table += html_items
215 |         elif type(value) == dict:  
216 |             i = 0
217 |             html_items = ""
218 |             for sub_key, sub_value in value.items():
219 |                 if type(sub_value) in [str, int, float, datetime.datetime] and sub_value:
220 |                     i += 1
221 |                     if type (sub_value) == datetime.datetime:
222 |                         sub_value = sub_value.strftime("%a %d %b, %Y %H:%M:%S %Z")
223 |                     if type (sub_value) == str:
224 |                         if(is_json(sub_value)):
225 |                             parsed_json = json.loads(sub_value)
226 |                             sub_value = json.dumps(parsed_json, indent=2)  
227 |                             sub_value = sub_value.replace('\n', '<br>')
228 |                             sub_value = sub_value.replace(' ', '&nbsp;')   
229 |                             sub_value = '<pre style="overflow-x: auto; white-space: pre-wrap; white-space: -moz-pre-wrap; white-space: -pre-wrap; white-space: -o-pre-wrap;  word-wrap: break-word;">' +sub_value +"</pre>"
230 |                     if i > 1:
231 |                         html_items += '<tr>'
232 |                     html_items += '<td id="7" style="word-wrap: break-all;"><b>%s</b></td><td id="8" style="word-wrap: break-all;">%s</td></tr>'  % (sub_key, sub_value)   
233 |             if i > 0:
234 |                 html_table += '<tr><td id="9" rowspan="%s"><b>%s</b></td>' % (i,key)
235 |                 html_table += html_items
236 |                 #html_table += '</tr>'               
237 |         elif type(value) in [str, int, float, datetime.datetime] and value:
238 |             if type (value) == datetime.datetime:
239 |                 value = value.strftime("%A %d %B, %Y %H:%M:%S %Z")   
240 |             if type (value) == str:    
241 |                 if(is_json(value)):
242 |                     parsed_json = json.loads(value)
243 |                     value = json.dumps(parsed_json, indent=2)  
244 |                     value = value.replace('\n', '<br>')
245 |                     value = value.replace(' ', '&nbsp;')    
246 |                     value = '<pre style="overflow-x: auto; white-space: pre-wrap; white-space: -moz-pre-wrap; white-space: -pre-wrap; white-space: -o-pre-wrap;  word-wrap: break-word;">' +value +"</pre>"
247 |             html_table += '<tr><td id="10"><b>%s</b></td><td id="11" colspan="2" style="word-wrap: break-all;">%s</td></tr>'  % (key, value)
248 |     html_table += "</table>"
249 |     return html_table


--------------------------------------------------------------------------------
/alarm_context_tool/functions_alarm.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | import botocore
 3 | from aws_lambda_powertools import Logger
 4 | from aws_lambda_powertools import Tracer
 5 | 
 6 | logger = Logger()
 7 | tracer = Tracer()
 8 | 
 9 | @tracer.capture_method
10 | def get_alarm_history(region, alarm_name):
11 |     """
12 |     Retrieves the alarm history for the given alarm name and region.
13 |     
14 |     Args:
15 |         region (str): The AWS region where the alarm is located.
16 |         alarm_name (str): The name of the alarm to retrieve the history for.
17 |     
18 |     Returns:
19 |         str: The alarm history in string format.
20 |     """
21 |     cloudwatch = boto3.client('cloudwatch', region_name=region)
22 |     try:
23 |         paginator = cloudwatch.get_paginator('describe_alarm_history')
24 |         alarm_history_items = []
25 |         for page in paginator.paginate(
26 |             AlarmName=alarm_name,
27 |             HistoryItemType='StateUpdate',
28 |             ScanBy='TimestampDescending',
29 |             PaginationConfig={
30 |                 'MaxItems': 10,
31 |                 'PageSize': 10
32 |             }            
33 |         ):
34 |             alarm_history_items.extend(page['AlarmHistoryItems'])
35 |         response = {'AlarmHistoryItems': alarm_history_items}        
36 |     except botocore.exceptions.ClientError as error:
37 |         logger.exception("Error getting alarm history data")
38 |         raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
39 |     except botocore.exceptions.ParamValidationError as error:
40 |         raise ValueError('The parameters you provided are incorrect: {}'.format(error))      
41 |     logger.info("Alarm History" , extra=response)
42 | 
43 |     for AlarmHistoryItem in response.get('AlarmHistoryItems', []):
44 |         AlarmHistoryItem.pop('AlarmName', None)
45 |         AlarmHistoryItem.pop('AlarmType', None)
46 |         AlarmHistoryItem.pop('HistoryData', None)
47 |         AlarmHistoryItem.pop('HistoryItemType', None) 
48 |     alarm_history = str(response) 
49 |     return alarm_history   


--------------------------------------------------------------------------------
/alarm_context_tool/functions_bedrock.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import json
  3 | import os
  4 | import botocore
  5 | 
  6 | from functions import get_information_panel
  7 | 
  8 | from aws_lambda_powertools import Logger
  9 | from aws_lambda_powertools import Tracer
 10 | 
 11 | logger = Logger()
 12 | tracer = Tracer()
 13 | 
 14 | @tracer.capture_method
 15 | def build_prompt_start():
 16 |     return '''
 17 |     The alarm message is contained in the <message> tag.
 18 |     
 19 |     Summarize the trigger for the alarm based on the metric and provide possible root causes and links to aws documentation that might help fix it. 
 20 |     Use the alarm history in the <alarm_history> tags to understand the frequency of the alarm and describe this to the reader.
 21 |     Using all of the available data, describe to the reader your interpretation of the immediacy that action is required to address the root cause.
 22 |     The response needs to be in HTML format, maximum header size should be h3. 
 23 |     Add headers to make the response more readable.
 24 | 
 25 |     '''
 26 | 
 27 | @tracer.capture_method
 28 | def build_section(instructions, tag_name, information):
 29 |     return f'''
 30 |     {instructions}
 31 |     <{tag_name}>
 32 |     {information}
 33 |     </{tag_name}>
 34 |     '''
 35 | 
 36 | @tracer.capture_method
 37 | def build_prompt_end():
 38 |     return '''
 39 |     The most important thing is to try to identify the root cause of potential issues with the information that you have.
 40 |     The actual values of the metrics in the <metric_data> tag should override the AlarmDescription in the <message> tag if there is a discrepancy
 41 |     The reponse must be in HTML, be structured with headers so its easy to read and include at least 3 links to relevant AWS documentation.
 42 |     Do not include an introductory line or prompt for a follow up. 
 43 |     If <cloudformation_template> exists, attempt to highlight a fix via changing the template in JSON format, presented in HTML, make the code change stand out.
 44 |     '''
 45 | 
 46 | @tracer.capture_method
 47 | def construct_prompt(alarm_history, message, metric_data, text_summary, health_events, truncated_cloudformation_template, resource_information_object, log_events, additional_metrics_with_timestamps_removed, trace_summary):
 48 |     prompt = build_prompt_start()
 49 |     
 50 |     # Add sections dynamically based on content
 51 |     if alarm_history:
 52 |         instructions = f'''
 53 | 
 54 |         Alarm history is contained in the <alarm_history> tag. 
 55 |         Use this information to understand the frequency of the alarm and describe this to the reader.
 56 |         '''
 57 |         prompt += build_section(instructions, 'alarm_history', alarm_history)
 58 |     
 59 |     if message:
 60 |         instructions = f'''
 61 | 
 62 |         The CloudWatch alarm message is contained in the <message> tag.
 63 |         '''
 64 |         prompt += build_section(instructions, 'message', message)
 65 |     
 66 |     if metric_data:
 67 |         instructions = f'''
 68 | 
 69 |         Metric data for the metric that triggered the alarm is contained in the <metric_data> tag. The metric will be graphed below your response. 
 70 |         The metric data contains 25 hours of data, comment on the last 24 hours of data and do a comparison with the last hour with the day before at the same time.
 71 |         '''
 72 |         prompt += build_section(instructions, 'metric_data', metric_data)
 73 |     
 74 |     if text_summary:
 75 |         instructions = f'''
 76 | 
 77 |         A human readable message for the alarm is contained in the <text_summary> tag. 
 78 |         The email  to the end user will already contain this summary above your response.
 79 |         '''
 80 |         prompt += build_section(instructions, 'text_summary', text_summary)
 81 |     
 82 |     if health_events:
 83 |         instructions = f'''
 84 | 
 85 |         AWS Health events are contained in the <health_events> tag.
 86 |         See if there are events in <health_events> that may be impacting the resources.
 87 |         Warn the reader if there are upcoming events for related resources.    
 88 |         '''
 89 |         prompt += build_section(instructions, 'health_events', health_events)
 90 |     
 91 |     if truncated_cloudformation_template:
 92 |         instructions = f'''
 93 | 
 94 |         The CloudFormation template used to create this resource is in the <truncated_cloudformation_template> tag.
 95 |         Values have been truncated to minimize token usage.
 96 |         Use the cloudformation_template and if there is a fix that can be made, call it out and tell the reader which code they need to change to resolve the issue.
 97 |         If this is identifiable, it will be the most important information that the reader will want to see.
 98 |         '''
 99 |         prompt += build_section(instructions, 'truncated_cloudformation_template', truncated_cloudformation_template)
100 |     
101 |     if resource_information_object:
102 |         instructions = f'''
103 | 
104 |         Information about the resource related to the metric is contained in the <resource_information_object> tag.
105 |         Use the resource_information_object as additional context, but also summarize or highlight any relevant data as well.
106 |         '''
107 |         prompt += build_section(instructions, 'resource_information_object', resource_information_object)                                        
108 |     
109 |     if log_events:
110 |         instructions = f'''
111 | 
112 |         If there are any relevant logs, the last 10 log events will be contained within the <log_events> tag.
113 |         '''
114 |         prompt += build_section(instructions, 'log_events', log_events)   
115 |     
116 |     if additional_metrics_with_timestamps_removed:
117 |         instructions = f'''
118 | 
119 |         Also use related metrics contained in the <additional_metrics> tag they are from 60 minutes before the time of the alarm. They have had the timestamps removed. 
120 |         Comment on each of the additional_metrics and it's relevance to the root cause.
121 |         '''
122 |         prompt += build_section(instructions, 'additional_metrics_with_timestamps_removed', additional_metrics_with_timestamps_removed)   
123 |     
124 |     if trace_summary:
125 |         instructions = f'''
126 | 
127 |         Also use the following trace summary contained in the <trace_summary> tag, it's likely to be the best source of information.
128 |         Comment on how the trace_summary shows the potential root cause. 
129 |         Do not output the trace to the reader in JSON format, if you quote it, it must be in human readable format.
130 |         When correlating the trace data with the alarm and metrics, be mindful that the trace may not have occurred at the same time as the alarm.
131 |         If necessary, explain that the trace may not have occurred at the same time as the alarm and any root cause may be correlated.
132 |         '''
133 |         prompt += build_section(instructions, 'trace_summary', trace_summary)   
134 |                             
135 |     prompt += build_prompt_end()
136 |     return prompt
137 | 
138 | @tracer.capture_method
139 | def execute_prompt(prompt):
140 |     if os.environ.get('USE_BEDROCK'):
141 |         model_name = os.environ.get('BEDROCK_MODEL_ID').split('.')[1].split('-v')[0].capitalize()
142 |         bedrock = boto3.client(service_name="bedrock-runtime",region_name=os.environ.get('BEDROCK_REGION'))
143 |         system_prompt = "You are a devops engineer providing guidance about how to do root cause analysis. Your response will be displayed in an email to a user where a CloudWatch alarm has been triggered."
144 |         max_tokens = int(os.environ.get('BEDROCK_MAX_TOKENS'))
145 |         user_message =  {"role": "user", "content": prompt}
146 |         messages = [user_message]
147 |         body=json.dumps(
148 |             {
149 |                 "anthropic_version": os.environ.get('ANTHROPIC_VERSION'),
150 |                 "max_tokens": max_tokens,
151 |                 "system": system_prompt,
152 |                 "messages": messages,
153 |                 "temperature": 0.5,
154 |                 "top_k": 250,
155 |                 "top_p": 0.999                
156 |             }  
157 |         )                       
158 |         try:
159 |             response = bedrock.invoke_model(body=body, modelId=os.environ.get('BEDROCK_MODEL_ID'))
160 |         except botocore.exceptions.ClientError as error:
161 |             logger.exception("Error calling Bedrock")
162 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
163 |         except botocore.exceptions.ParamValidationError as error:
164 |             raise ValueError('The parameters you provided are incorrect: {}'.format(error))   
165 |         
166 |         response_body = json.loads(response.get("body").read())
167 |         logger.debug("Bedrock Response", extra=response_body) 
168 |         ai_response = get_information_panel(model_name + " says:", response_body["content"][0]["text"])
169 |     else:
170 |         ai_response = get_information_panel("Bedrock says:", "Bedrock analysis is disabled.")    
171 |     return ai_response
172 | 


--------------------------------------------------------------------------------
/alarm_context_tool/functions_cloudformation.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | import yaml
  4 | import boto3
  5 | import botocore
  6 | from collections import OrderedDict
  7 | from cfn_flip import to_json
  8 | from datetime import date
  9 | 
 10 | from aws_lambda_powertools import Logger
 11 | from aws_lambda_powertools import Tracer
 12 | 
 13 | logger = Logger()
 14 | tracer = Tracer()
 15 | 
 16 | @tracer.capture_method
 17 | def process_cloudformation_template(cloudformation_template, trace_summary, max_length=100):
 18 |     if 'trace_summary' not in locals() or not trace_summary or 'TraceSummaries' not in trace_summary:
 19 |         # No trace summary or no traces available, return the entire truncated template
 20 |         preprocessed_template = truncate_template(cloudformation_template, max_length)
 21 |         return preprocessed_template
 22 | 
 23 |     fault_root_cause_types = set()
 24 |     error_root_cause_types = set()
 25 | 
 26 |     for trace in trace_summary['TraceSummaries']:
 27 |         fault_root_causes = trace.get('FaultRootCauses', [])
 28 |         fault_root_cause_types.update(get_root_cause_service_types(fault_root_causes))
 29 | 
 30 |         error_root_causes = trace.get('ErrorRootCauses', [])
 31 |         error_root_cause_types.update(get_root_cause_service_types(error_root_causes))
 32 | 
 33 |     combined_root_cause_types = fault_root_cause_types | error_root_cause_types
 34 | 
 35 |     filtered_resources = filter_resources_from_template(cloudformation_template, combined_root_cause_types)
 36 |     if filtered_resources:
 37 |         # If resources are filtered based on root cause types, return the filtered resources
 38 |         preprocessed_template = json.dumps(filtered_resources, indent=2)
 39 |     else:
 40 |         # If no resources are filtered, return the entire truncated template
 41 |         preprocessed_template = truncate_template(cloudformation_template, max_length)
 42 | 
 43 |     return preprocessed_template
 44 | 
 45 | @tracer.capture_method
 46 | def get_root_cause_service_types(root_causes):
 47 |     root_cause_types = set()
 48 | 
 49 |     for root_cause in root_causes:
 50 |         services = root_cause.get('Services', [])
 51 | 
 52 |         for service in services:
 53 |             entity_path = service.get('EntityPath', [])
 54 |             service_type = service.get('Type')
 55 | 
 56 |             if service_type != 'remote':
 57 |                 for entity in entity_path:
 58 |                     if 'Exceptions' in entity and entity['Exceptions']:
 59 |                         root_cause_types.add(service_type)
 60 |                         if entity['Name'] == 'DynamoDB':
 61 |                             root_cause_types.add('AWS::DynamoDB::Table')
 62 | 
 63 |     return root_cause_types
 64 | 
 65 | @tracer.capture_method
 66 | def filter_resources_from_template(template_str, root_cause_types):
 67 |     template_dict = json.loads(template_str)
 68 | 
 69 |     # Filter resources
 70 |     filtered_resources = {}
 71 |     for resource_id, resource_details in template_dict.get('Resources', {}).items():
 72 |         resource_type = resource_details.get('Type')
 73 |         if resource_type in root_cause_types:
 74 |             filtered_resources[resource_id] = resource_details
 75 | 
 76 |     return filtered_resources
 77 | 
 78 | @tracer.capture_method
 79 | def truncate_template(template_str, max_length):
 80 |     template_obj = json.loads(template_str, object_pairs_hook=OrderedDict)
 81 | 
 82 |     # Truncate values in the template object
 83 |     truncated_obj = truncate_values(template_obj, max_length)
 84 | 
 85 |     # Convert the Python object to JSON
 86 |     json_obj = json.loads(json.dumps(truncated_obj, cls=CustomJSONEncoder))
 87 | 
 88 |     # Minify the JSON object
 89 |     truncated_template_str = json.dumps(json_obj, separators=(',', ':'))
 90 | 
 91 |     return truncated_template_str
 92 | 
 93 | 
 94 | @tracer.capture_method
 95 | def remove_comments(template_str):
 96 |     if template_str.strip().startswith('{'):
 97 |         # JSON template
 98 |         pattern = r'//.*?$|/\*(?:.|[\r\n])*?\*/'
 99 |         return re.sub(pattern, '', template_str, flags=re.MULTILINE)
100 |     else:
101 |         # YAML template
102 |         lines = []
103 |         for line in template_str.splitlines():
104 |             if not line.strip().startswith('#'):
105 |                 lines.append(line)
106 |         return '\n'.join(lines)
107 | 
108 | @tracer.capture_method
109 | def truncate_values(obj, max_length=100):
110 |     if isinstance(obj, str):
111 |         return obj[:max_length]
112 |     elif isinstance(obj, dict):
113 |         return {k: truncate_values(v, max_length) for k, v in obj.items()}
114 |     elif isinstance(obj, list):
115 |         return [truncate_values(item, max_length) for item in obj]
116 |     else:
117 |         return obj
118 | 
119 | @tracer.capture_method
120 | class CustomJSONEncoder(json.JSONEncoder):
121 |     def default(self, obj):
122 |         if isinstance(obj, date):
123 |             return obj.isoformat()
124 |         return super().default(obj)
125 | 
126 | @tracer.capture_method
127 | def find_cloudformation_arn(tags):
128 |     cloudformation_arn = None
129 | 
130 |     if isinstance(tags, list):
131 |         for tag in tags:
132 |             if tag['Value'].startswith('arn:aws:cloudformation:'):
133 |                 cloudformation_arn = tag['Value']
134 |                 break  # Exit the loop once found
135 | 
136 |     elif isinstance(tags, dict):
137 |         for key, value in tags.items():
138 |             if value.startswith('arn:aws:cloudformation:'):
139 |                 cloudformation_arn = value
140 |                 break  # Exit the loop once found
141 | 
142 |     return cloudformation_arn
143 | 
144 | @tracer.capture_method
145 | def get_cloudformation_template(tags, region, trace_summary, max_length=100):
146 |     preprocessed_template = None
147 | 
148 |     if not tags:
149 |         logger.info("No tags found or 'Tags' is unassigned.")
150 | 
151 |     cloudformation_arn = find_cloudformation_arn(tags)
152 | 
153 |     if cloudformation_arn:      
154 |         cloudformation = boto3.client('cloudformation', region_name=region)
155 |         try:
156 |             response = cloudformation.get_template(
157 |                 StackName=cloudformation_arn,
158 |                 TemplateStage='Processed'
159 |             )
160 |         except botocore.exceptions.ClientError as error:
161 |             logger.exception("Error getting CloudFormation template")
162 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error
163 |         except botocore.exceptions.ParamValidationError as error:
164 |             raise ValueError('The parameters you provided are incorrect: {}'.format(error))
165 |         
166 |         cloudformation_template = response['TemplateBody']
167 |         logger.info ("Cloudformation Template", extra=response)                 
168 | 
169 |         try:
170 |             # Attempt to treat the string as YAML and convert to JSON
171 |             cloudformation_template = to_json(cloudformation_template)
172 |         except Exception as e:
173 |             # The template is in JSON, conver to string
174 |             cloudformation_template = json.dumps(cloudformation_template)
175 |             
176 |         preprocessed_template = process_cloudformation_template(cloudformation_template, trace_summary, max_length)
177 | 
178 |     return preprocessed_template


--------------------------------------------------------------------------------
/alarm_context_tool/functions_email.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | import os
  4 | import base64
  5 | import urllib.parse
  6 | import markdown  # Make sure to install Markdown if you haven't already
  7 | 
  8 | from email.mime.multipart import MIMEMultipart
  9 | from email.mime.text import MIMEText
 10 | from email.mime.application import MIMEApplication
 11 | 
 12 | from functions import get_information_panel
 13 | from functions import get_dashboard_button
 14 | 
 15 | from aws_lambda_powertools import Logger
 16 | from aws_lambda_powertools import Tracer
 17 | logger = Logger()
 18 | tracer = Tracer()
 19 | 
 20 | @tracer.capture_method
 21 | def get_generic_links(region):
 22 |     """
 23 |     Generates generic links for the AWS Console.
 24 | 
 25 |     Parameters:
 26 |     - region (str): The AWS region code for generating deep links.
 27 | 
 28 |     Returns:
 29 |     - str: HTML formatted links to the AWS Console.
 30 |     """
 31 |     # AWS Console links
 32 |     cross_service_dashboard_link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#home:cross_service' % (region, region)
 33 |     generic_information = get_dashboard_button("Cross service dashboard", cross_service_dashboard_link)
 34 |     aws_health_dashboard_link = 'https://health.aws.amazon.com/health/home'    
 35 |     generic_information += get_dashboard_button("AWS Health dashboard", aws_health_dashboard_link)
 36 |     return generic_information
 37 | 
 38 | @tracer.capture_method
 39 | def build_email_summary(alarm_name, region_name, new_state, reason, display_change_time, alarm_description, region):
 40 |     """
 41 |     Builds the email summary for a CloudWatch Alarm notification.
 42 | 
 43 |     Parameters:
 44 |     - alarm_name (str): The name of the CloudWatch Alarm.
 45 |     - region_name (str): The AWS region where the alarm is set.
 46 |     - new_state (str): The new state of the alarm (e.g., ALARM, OK).
 47 |     - reason (str): The reason why the alarm changed its state.
 48 |     - display_change_time (str): The time at which the alarm state changed, in a human-readable format.
 49 |     - alarm_description (str): The description of the CloudWatch Alarm.
 50 |     - region (str): The AWS region code for generating the deep link to the alarm.
 51 | 
 52 |     Returns:
 53 |     - str: HTML formatted summary of the alarm notification.
 54 |     """
 55 |     # Message Summary
 56 |     summary  = f'<p>Your Amazon CloudWatch Alarm <b>"{alarm_name}"</b> in the <b>{region_name}</b> region has entered the <b>{new_state}</b> state, because <b>"{reason}"</b> at <b>"{display_change_time}"</b>.<p>'
 57 |     summary += '<style>table#info tr{border:1px solid #232F3E;}  table#info tr:nth-child(even) { background-color:#D4DADA; } table#info tr:nth-child(odd) { background-color:#F1F3F3; }</style>'
 58 |     
 59 |     if not alarm_description:
 60 |         panel_title = "Your alarm has no description."
 61 |         panel_content = "Use alarm descriptions to add context and links to your alarms using markdown."
 62 |         summary += get_information_panel(panel_title, panel_content)
 63 |     else:
 64 |         summary += '<table id="info" style="max-width:640px; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="640" align="center" border="0">'    
 65 |         summary += '<tr><td><center><b>Alarm Description</b></center></td></tr><tr><td>'
 66 |         summary += markdown.markdown(alarm_description)
 67 |         summary += '</td></tr></table>'
 68 | 
 69 |     encoded_alarm_name = urllib.parse.quote_plus(alarm_name)
 70 |     alarm_link = f'https://{region}.console.aws.amazon.com/cloudwatch/deeplink.js?region={region}#alarmsV2:alarm/{encoded_alarm_name}'
 71 |     summary += get_dashboard_button("View this alarm in the AWS Management Console", alarm_link)
 72 | 
 73 |     return summary
 74 | 
 75 | @tracer.capture_method
 76 | def send_email(sender, recipient, subject, body_text, body_html, attachments=None, charset="UTF-8"):
 77 |     """
 78 |     Send an email using AWS SES.
 79 |     
 80 |     Parameters:
 81 |     - sender (str): Email address of the sender.
 82 |     - recipient (str): Email address of the recipient.
 83 |     - subject (str): Subject line of the email.
 84 |     - body_text (str): Plain text body of the email.
 85 |     - body_html (str): HTML body of the email.
 86 |     - attachments (list of dicts): Files to attach to the email. Each dict must have 'filename' and 'data' keys.
 87 |     - charset (str): Character set for the text encoding.
 88 |     """   
 89 |     # Create a multipart/mixed parent container.
 90 |     msg = MIMEMultipart('mixed')
 91 |     msg['Subject'] = subject
 92 |     msg['From'] = sender
 93 |     msg['To'] = recipient
 94 | 
 95 |     # Create a multipart/alternative part for the text and HTML content.
 96 |     msg_body = MIMEMultipart('alternative')
 97 |     text_part = MIMEText(body_text, 'plain', charset)
 98 |     html_part = MIMEText(body_html, 'html', charset)
 99 |     msg_body.attach(text_part)
100 |     msg_body.attach(html_part)
101 | 
102 |     # Attach the multipart/alternative part to the message container.
103 |     msg.attach(msg_body)
104 |     
105 |     # Attach any files to the message.
106 |     if attachments:
107 |         for attachment in attachments:
108 |             part = MIMEApplication(attachment['data'])
109 |             part.add_header('Content-Disposition', 'attachment', filename=attachment['filename'])
110 |             part.add_header('Content-ID', attachment['id'])  
111 |             msg.attach(part)
112 | 
113 |     # Send the email
114 |     try:
115 |         ses = boto3.client('ses', region_name=os.environ['AWS_REGION'])
116 |         response = ses.send_raw_email(Source=sender, Destinations=[recipient], RawMessage={'Data': msg.as_string()})
117 |         print("Email Sent", response['MessageId'])
118 |     except botocore.exceptions.ClientError as error:
119 |         logger.exception("Error Sending Email")
120 |         raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error
121 |     except botocore.exceptions.ParamValidationError as error:
122 |         raise ValueError('The parameters you provided are incorrect: {}'.format(error))        
123 | 
124 | @tracer.capture_method
125 | def build_html_body(subject, summary, ai_response, widget_images, trace_html, additional_information, alarm_details, metric_details):
126 |     
127 |     spacer_row = '<tr><td></td><td width="100%" style="text-align:left; line-height: 10px;">&nbsp;</td><td></td></tr>'
128 | 
129 |     BODY_HTML = '''
130 |     <!DOCTYPE htmlPUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
131 |     <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
132 |     '''
133 | 
134 |     # Head
135 |     BODY_HTML += f'''
136 |         <head>
137 |             <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
138 |             <meta http-equiv="X-UA-Compatible" content="IE=edge">
139 |             <meta name="viewport" content="width=device-width, initial-scale=1.0">
140 |             </style>            
141 |             <title>{subject}</title>
142 |         </head>
143 |     '''
144 | 
145 |     # Body and containing table
146 |     BODY_HTML += '''
147 |         <body>
148 |             <center>
149 |                 <table style="word-wrap: break-all; width:100%;max-width:640px;margin: 0 auto;" width="100%" width="640" cellpadding="0" cellspacing="0" border="0">
150 |     '''
151 |     
152 |     # Title
153 |     BODY_HTML += f'''
154 |                     <tr><td></td><td width="640" style="max-width:640px; padding:9px; color: rgb(255, 255, 255) !important; -webkit-text-fill-color: rgb(255, 255, 255) !important; margin-bottom:10px; text-align:left; background: rgb(35,47,62); background: linear-gradient(135deg, rgba(35,47,62,1) 0%, rgba(0,49,129,1) 25%, rgba(0,49,129,1) 50%, rgba(32,116,213,1) 90%, rgba(255,153,0,1) 100%);">
155 |                     {subject}</td><td></td></tr>
156 |     '''
157 | 
158 |     BODY_HTML += spacer_row
159 | 
160 |     # Summary
161 |     BODY_HTML += f'<tr><td></td><td width="640" style="max-width:640px; text-align:left;">{summary}</td><td></td></tr>'
162 | 
163 |     # AI Response
164 |     BODY_HTML += f'<tr><td></td><td width="640" style="max-width:640px; text-align:left;">{ai_response}</td><td></td></tr>'
165 | 
166 |     # Main Widget
167 |     BODY_HTML += '<tr><td></td><td width="640" style="max-width:640px; text-align:left; background-color: #ffffff; background-image: linear-gradient(#ffffff,#ffffff);"><center><img style="margin-bottom:10px;" src="cid:imageId"></center></td><td></td></tr>'
168 |     
169 |     if widget_images:
170 |         BODY_HTML += '<tr><td></td><td width="100%" style="max-width: 640px !important; text-align:left; background-color: #ffffff; background-image: linear-gradient(#ffffff,#ffffff);">'
171 |         BODY_HTML += '<center><table style="max-width: 640px !important;" width="640">'
172 | 
173 |         # Directly iterating in chunks of 2
174 |         for i in range(0, len(widget_images), 2):
175 |             row = widget_images[i:i+2]  # Get slice for the current row
176 |             BODY_HTML += '<tr>'
177 |             for widget_image in row:
178 |                 image_id = widget_image["widget"].replace(" ", "_")  # Assuming this forms your Content-ID
179 |                 if isinstance(widget_image['data'], bytes):
180 |                     BODY_HTML += f'<td style="max-width: 320px !important;" width="320"><img style="margin-bottom:10px;" src="cid:{image_id}"></td>'
181 |                 elif isinstance(widget_image['data'], str):
182 |                     BODY_HTML += f'<td valign="top" style="vertical-align-top; max-width: 320px !important;" width="320">{widget_image["data"]}</td>'
183 |             BODY_HTML += '</tr>'
184 | 
185 |         BODY_HTML += '</table></center>'
186 |         BODY_HTML += '</td><td></td></tr>'    
187 | 
188 |     # Traces
189 |     if trace_html:
190 |         BODY_HTML += spacer_row
191 |         BODY_HTML += f'<tr><td></td><td width="640" style="text-align:left;">{trace_html}</td><td></td></tr>'
192 |     
193 |     BODY_HTML += spacer_row
194 | 
195 |     # Additional Information
196 |     BODY_HTML += f'''   
197 |                     <tr><td></td><td width="640" style="text-align:left;">
198 |                     <table cellpadding="0" cellspacing="0" border="0" style="padding:0px;margin:0px;width:100%;">
199 |                         <tr><td colspan="3" style="padding:0px;margin:0px;font-size:20px;height:20px;" height="20">&nbsp;</td></tr>
200 |                         <tr>
201 |                             <td style="padding:0px;margin:0px;">&nbsp;</td>
202 |                             <td style="padding:0px;margin:0px;" width="640">{additional_information}</td>
203 |                             <td style="padding:0px;margin:0px;">&nbsp;</td>
204 |                         </tr>
205 |                         <tr><td colspan="3" style="padding:0px;margin:0px;max-width: 640px !important;" height="20">&nbsp;</td></tr>
206 |                     </table>
207 |                     </td><td></td></tr> 
208 |     '''
209 | 
210 |     # Alarm Details
211 |     BODY_HTML += spacer_row
212 |     BODY_HTML += f'<tr><td></td><td width="640" style="text-align:left;">{alarm_details}</td><td></td></tr>'
213 | 
214 |     # Metric Details
215 |     BODY_HTML += spacer_row
216 |     BODY_HTML += f'<tr><td></td><td width="640" style="text-align:left;">{metric_details}</td><td></td></tr>'
217 | 
218 |     # End body, containing table and HTML
219 |     BODY_HTML += '''
220 |                 </table>
221 |             </center>
222 |         </body>
223 |     </html>                    
224 |     ''' 
225 | 
226 |     return BODY_HTML
227 | 
228 | 


--------------------------------------------------------------------------------
/alarm_context_tool/functions_health.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import botocore
 3 | 
 4 | from  health_client import HealthClient
 5 | 
 6 | from aws_lambda_powertools import Logger
 7 | from aws_lambda_powertools import Tracer
 8 | logger = Logger()
 9 | tracer = Tracer()
10 | 
11 | # AWS Health Event Details
12 | @tracer.capture_method
13 | def event_details(event_arns):
14 |     event_descriptions = {}
15 |     batch_size = 10
16 |     batches = [event_arns[i:i + batch_size] for i in range(0, len(event_arns), batch_size)]
17 | 
18 |     for batch in batches:
19 |         event_details_response = HealthClient.client().describe_event_details(eventArns=batch)
20 |         for event_details in event_details_response['successfulSet']:
21 |             event_arn = event_details['event']['arn']
22 |             event_description = event_details['eventDescription']['latestDescription']
23 |             event_descriptions[event_arn] = event_description
24 | 
25 |     return event_descriptions
26 | 
27 | # AWS Health Events
28 | @tracer.capture_method
29 | def describe_events(region):
30 |     events_paginator = HealthClient.client().get_paginator('describe_events')
31 | 
32 |     try:
33 |         events_pages = events_paginator.paginate(filter={
34 |             'startTimes': [
35 |                 {
36 |                     'from': datetime.datetime.now() - datetime.timedelta(days=7)
37 |                 }
38 |             ],
39 |             'regions': [
40 |                 region,
41 |             ],
42 |             'eventStatusCodes': ['open', 'upcoming']
43 |         })
44 | 
45 |         event_arns = []
46 |         for events_page in events_pages:
47 |             for event in events_page['events']:
48 |                 event_arns.append(event['arn'])
49 | 
50 | 
51 |     except botocore.exceptions.ClientError as error:
52 |         error_code = error.response['Error']['Code']
53 |         if error_code == 'SubscriptionRequiredException':
54 |             logger.warning("You need a Business, Enterprise On-Ramp, or Enterprise Support plan from AWS Support to use this operation. Skipping health events.")
55 |             return {}
56 |         else:
57 |             logger.exception("Error describing health Events")
58 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error
59 |     except botocore.exceptions.ParamValidationError as error:
60 |         raise ValueError('The parameters you provided are incorrect: {}'.format(error))
61 |     else:
62 |         if event_arns:
63 |             event_descriptions = event_details(event_arns)         
64 |             return event_descriptions
65 |         else:
66 |             logger.info('There are no AWS Health events that match the given filters')
67 |             return {}


--------------------------------------------------------------------------------
/alarm_context_tool/functions_logs.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | 
  4 | import datetime
  5 | from datetime import timedelta
  6 | import time
  7 | import urllib.parse
  8 | import pandas as pd
  9 | 
 10 | from aws_lambda_powertools import Logger
 11 | from aws_lambda_powertools import Tracer
 12 | logger = Logger()
 13 | tracer = Tracer()
 14 | 
 15 | @tracer.capture_method
 16 | def get_log_insights_link(log_input, log_insights_query, region, start_time, end_time):
 17 |     """
 18 |     Generates a link to a CloudWatch Logs Insights query with the specified query, time range and log input.
 19 | 
 20 |     Args:
 21 |     - log_input: A dictionary or list containing the log group name or log stream name.
 22 |     - log_insights_query: The query to execute on the logs.
 23 |     - region: The AWS region of the logs.
 24 |     - start_time: The start time of the query, in ISO format with timezone information.
 25 |     - end_time: The end time of the query, in ISO format with timezone information.
 26 | 
 27 |     Returns:
 28 |     - A link to the CloudWatch Logs Insights query with the specified parameters.
 29 |     """    
 30 |     # convert back to string with required format
 31 |     end_time_str = str(datetime.datetime.strptime(end_time,'%Y-%m-%dT%H:%M:%S.%f%z').strftime('%Y-%m-%dT%H*3a%M*3a%S.%f')[:-3]) +"Z"
 32 |     start_time_str = str(datetime.datetime.strptime(start_time,'%Y-%m-%dT%H:%M:%S.%f%z').strftime('%Y-%m-%dT%H*3a%M*3a%S.%f')[:-3]) +"Z"                
 33 |                 
 34 |     if isinstance(log_input, list):
 35 |         log_groups = []
 36 |         log_insights_log_groups = ''
 37 |         for log_dict in log_input:
 38 |             if 'logGroupName' in log_dict:
 39 |                 log_group_name = log_dict['logGroupName']            
 40 |                 log_insights_log_groups += "~'"
 41 |                 log_insights_log_groups += urllib.parse.quote_plus(log_group_name)                
 42 |     elif isinstance(log_input, dict):
 43 |         if 'logStreamName' in log_input:
 44 |             log_stream_name = log_input['logStreamName']
 45 |             log_groups = search_log_groups(log_stream_name, region)
 46 |             log_insights_log_groups = ''
 47 |             for log_group in log_groups:
 48 |                 log_insights_log_groups += "~'"
 49 |                 log_insights_log_groups += urllib.parse.quote_plus(log_group)
 50 |         elif 'logGroupName' in log_input:
 51 |             log_group_name = log_input['logGroupName']            
 52 |             log_insights_log_groups = "~'"
 53 |             log_insights_log_groups += urllib.parse.quote_plus(log_group_name)
 54 |             
 55 |     log_insights_query_trimmed = log_insights_query.replace('  ','')
 56 |     encoded_log_insights_query = urllib.parse.quote_plus(log_insights_query_trimmed)
 57 |     encoded_log_insights_query_asterisks = encoded_log_insights_query.replace("%","*")
 58 |     log_insights_link = f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#logsV2:logs-insights$3FqueryDetail$3D~(end~'{end_time_str}~start~'{start_time_str}~timeType~'ABSOLUTE~tz~'Local~editorString~'{encoded_log_insights_query_asterisks}~source~({log_insights_log_groups}))"
 59 |     return log_insights_link
 60 | 
 61 | @tracer.capture_method
 62 | def get_log_insights_query_results(log_group, log_insights_query, region):
 63 |     """
 64 |     Retrieves the results of a CloudWatch Logs Insights query for a given log group.
 65 | 
 66 |     Args:
 67 |         log_group (str): The name of the log group to query.
 68 |         log_insights_query (str): The query to execute on the logs.
 69 |         region (str): The AWS region of the logs.
 70 | 
 71 |     Returns:
 72 |         log_insights_query_results_html
 73 |         log_insights_query_results_json
 74 |     """
 75 | 
 76 |     logs = boto3.client('logs', region_name=region)
 77 | 
 78 |     try:
 79 |         start_query_response = logs.start_query(
 80 |             logGroupName=log_group,
 81 |             startTime=int((datetime.datetime.today() - timedelta(hours=3)).timestamp()),
 82 |             endTime=int(datetime.datetime.now().timestamp()),
 83 |             queryString=log_insights_query,
 84 |         )
 85 |     except botocore.exceptions.ClientError as error:
 86 |         logger.exception("Error starting query")
 87 |         raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
 88 |     except botocore.exceptions.ParamValidationError as error:
 89 |         raise ValueError('The parameters you provided are incorrect: {}'.format(error)) 
 90 | 
 91 |     query_id = start_query_response['queryId']
 92 | 
 93 |     response = None
 94 | 
 95 |     try:
 96 |         while response is None or response['status'] == 'Running': 
 97 |             time.sleep(1) # nosemgrep
 98 |             response = logs.get_query_results(
 99 |                 queryId=query_id
100 |             )
101 |     except botocore.exceptions.WaiterError as error:
102 |         logger.warning("Error waiting for query to complete")
103 |         raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error    
104 | 
105 |     log_insights_query_results_json = response['results']
106 | 
107 |     # Step 1: Extract all unique field names
108 |     fields = set()
109 |     for result in log_insights_query_results_json:
110 |         for entry in result:
111 |             fields.add(entry['field'])
112 | 
113 |     # Step 2: Construct rows
114 |     rows = []
115 |     for result in log_insights_query_results_json:
116 |         row = {field: '' for field in fields}  # Initialize all fields with empty string
117 |         for entry in result:
118 |             row[entry['field']] = entry['value']
119 |         rows.append(row)
120 | 
121 |     # Step 3: Create DataFrame
122 |     df = pd.DataFrame(rows)
123 | 
124 |     # Step 4: Convert DataFrame to HTML table
125 |     log_insights_query_results_html = df.to_html(index=False, escape=False)
126 | 
127 |     # Adjust the table
128 |     new_table_tag = '<table id="info" width="640" style="max-width:640px !important; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="100%" align="center" border="0">'
129 |     log_insights_query_results_html = log_insights_query_results_html.replace('<table border="1" class="dataframe">', new_table_tag)    
130 | 
131 |     return log_insights_query_results_html, log_insights_query_results_json
132 | 
133 | 
134 | @tracer.capture_method
135 | def get_last_10_events(log_input, timestamp, region):
136 |     """
137 |     Retrieves the last 10 log events for a given log stream and creates an HTML table to display the results.
138 | 
139 |     Args:
140 |         log_input (dict): A dictionary containing information about the log stream to query. Must contain the key 'logStreamName'.
141 |         timestamp (datetime): The timestamp to use as the end time for the log event query.
142 |     
143 |     Returns:
144 |         html_table (str): A string containing an HTML table with the last 10 log events for the specified log stream.   
145 |     """
146 |     html_table = ''
147 |     logs = boto3.client('logs', region_name=region)
148 |     if 'logStreamName' in log_input:
149 |         log_stream_name = log_input['logStreamName']
150 |         log_groups = search_log_groups(log_stream_name, region)
151 |         log_events = []
152 |         for log_group in log_groups:
153 |             try:
154 |                 response = logs.filter_log_events(
155 |                     logGroupName=log_group, 
156 |                     logStreamNames=[log_stream_name], 
157 |                     limit=10, 
158 |                     endTime=int(timestamp.timestamp() * 1000)
159 |                 )
160 |             except botocore.exceptions.ClientError as error:
161 |                 logger.exception("Error filtering log events")
162 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
163 |             except botocore.exceptions.ParamValidationError as error:
164 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))            
165 | 
166 |             log_events.extend(response['events'])
167 | 
168 |             if not log_events:
169 |                 html_table = '<p>No log events found.</p>'
170 |             else:
171 |                 html_table += '<table id="info" width="640" style="max-width:640px !important; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="100%" align="center" border="0">'
172 |                 html_table += f'<tr><th colspan="2">Log group: {log_group}<br>Log stream: {log_stream_name}</th></tr>'
173 |                 html_table += '<tr><th>Timestamp</th><th>Message</th></tr>'
174 |                 for event in log_events:
175 |                     timestamp_str = timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + 'Z'                  
176 |                     message = event['message'].replace('\n', '<br>')
177 |                     html_table += f'<tr><td>{timestamp_str}</td><td style="word-break:break-all;">{message}</td></tr>'
178 |                 html_table += '</table>'
179 | 
180 |     elif 'logGroupName' in log_input:
181 |         log_group_name = log_input['logGroupName']
182 | 
183 |         try:
184 |             response = logs.filter_log_events(logGroupName=log_group_name, limit=10, endTime=int(timestamp.timestamp() * 1000))
185 |         except botocore.exceptions.ClientError as error:
186 |             logger.exception("Error filtering log events")
187 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
188 |         except botocore.exceptions.ParamValidationError as error:
189 |             raise ValueError('The parameters you provided are incorrect: {}'.format(error)) 
190 |         
191 |         log_events = response['events']
192 |         
193 |         if not log_events:
194 |             html_table += '<table id="info"width="640" style="max-width:640px !important; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="100%" align="center" border="0">'
195 |             html_table += f'<tr><th colspan="2">Log group: {log_group_name}<br>Log stream: N/A</th></tr>'
196 |             html_table += '<tr><th>Timestamp</th><th>Message</th></tr>'
197 |             html_table += f'<tr><td colspan="2"><p>No log events found in the time period specified.</p></td></tr>'
198 |             html_table += '</table>'            
199 |         else:
200 |             log_stream_name = log_events[0]['logStreamName']
201 |             html_table += '<table id="info"width="640" style="max-width:640px !important; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="100%" align="center" border="0">'
202 |             html_table += f'<tr><th colspan="2">Log group: {log_group_name}<br>Log stream: {log_stream_name}</th></tr>'
203 |             html_table += '<tr><th>Timestamp</th><th>Message</th></tr>'
204 |             for event in log_events:
205 |                 timestamp_str = timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + 'Z'                  
206 |                 message = event['message'].replace('\n', '<br>')
207 |                 html_table += f'<tr><td>{timestamp_str}</td><td style="word-break:break-all;">{message}</td></tr>'
208 |             html_table += '</table>'
209 | 
210 |     return html_table, log_events
211 | 
212 | @tracer.capture_method
213 | def search_log_groups(log_stream_name, region):
214 |     """
215 |     Searches for all log groups that contain a given log stream name and returns the filtered list of log group names.
216 |     
217 |     Args:
218 |     
219 |     log_stream_name: The name of the log stream to search for.
220 |     Returns:
221 |     
222 |     A list of log group names that contain the given log stream name.
223 |     """   
224 |     logs = boto3.client('logs', region_name=region) 
225 |     try:
226 |         paginator = logs.get_paginator('describe_log_groups')
227 |         log_groups_list = []
228 |         for page in paginator.paginate():
229 |             log_groups_list.extend(page['logGroups'])
230 |         response = {'logGroups': log_groups_list}        
231 |     except botocore.exceptions.ClientError as error:
232 |         logger.exception("Error describing log groups")
233 |         raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
234 |     except botocore.exceptions.ParamValidationError as error:
235 |         raise ValueError('The parameters you provided are incorrect: {}'.format(error)) 
236 | 
237 |     log_groups = response['logGroups']
238 | 
239 | 
240 |     filtered_log_groups = []
241 |     for log_group in log_groups:
242 |         try:
243 |             response = logs.describe_log_streams(logGroupName=log_group['logGroupName'], logStreamNamePrefix=log_stream_name, limit=1)
244 |         except botocore.exceptions.ClientError as error:
245 |             logger.exception("Error describing Log Groups")
246 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
247 |         except botocore.exceptions.ParamValidationError as error:
248 |             raise ValueError('The parameters you provided are incorrect: {}'.format(error))         
249 |         if response['logStreams']:
250 |             filtered_log_groups.append(log_group['logGroupName'])                   
251 |     return filtered_log_groups
252 |     
253 | @tracer.capture_method
254 | def check_log_group_exists(log_group_name, region):
255 |     """
256 |     Checks whether the specified log group exists in AWS CloudWatch Logs.
257 |     
258 |     Args:
259 |     - log_group_name: The name of the log group to check.
260 |     
261 |     Returns:
262 |     - A boolean value indicating whether the log group exists (True) or not (False).
263 |     """    
264 |     logs = boto3.client('logs', region_name=region)
265 | 
266 |     try:
267 |         paginator = logs.get_paginator('describe_log_groups')
268 |         log_groups_list = []
269 |         for page in paginator.paginate(logGroupNamePrefix=log_group_name):
270 |             log_groups_list.extend(page['logGroups'])
271 |         response = {'logGroups': log_groups_list}        
272 |     except botocore.exceptions.ClientError as error:
273 |         logger.exception("Error describing log groups")
274 |         raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
275 |     except botocore.exceptions.ParamValidationError as error:
276 |         raise ValueError('The parameters you provided are incorrect: {}'.format(error))          
277 | 
278 |     if not response['logGroups']:
279 |         return False
280 |     else:
281 |         return True    
282 |         


--------------------------------------------------------------------------------
/alarm_context_tool/functions_xray.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | 
  4 | import json
  5 | import datetime
  6 | import pandas as pd
  7 | 
  8 | from functions import json_serial
  9 | from functions import get_dashboard_button
 10 | 
 11 | from aws_lambda_powertools import Logger
 12 | from aws_lambda_powertools import Tracer
 13 | logger = Logger()
 14 | tracer = Tracer()
 15 | 
 16 | @tracer.capture_method
 17 | def process_traces(filter_expression, region, trace_start_time, trace_end_time):
 18 |     # Initialize the boto3 client for AWS X-Ray
 19 |     xray = boto3.client('xray', region_name=region)   
 20 | 
 21 |     # Sometimes alarms are triggered by issues where there is no error or fault in the trace
 22 |     # Subtract another 21 hours
 23 |     start_datetime = datetime.datetime.strptime(trace_start_time, '%Y-%m-%dT%H:%M:%S.%f%z')
 24 |     adjusted_datetime = start_datetime - datetime.timedelta(hours=21)
 25 |     trace_start_time = adjusted_datetime.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + adjusted_datetime.strftime('%z')          
 26 | 
 27 |     try:
 28 |         # Retrieve the trace summaries
 29 |         response = xray.get_trace_summaries(
 30 |             StartTime=trace_start_time,
 31 |             EndTime=trace_end_time,
 32 |             TimeRangeType='Event',
 33 |             Sampling=False,
 34 |             FilterExpression=filter_expression
 35 |         )       
 36 |     except botocore.exceptions.ClientError as error:
 37 |         logger.exception("Error getting trace summaries")
 38 |         raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
 39 |     except botocore.exceptions.ParamValidationError as error:
 40 |         raise ValueError('The parameters you provided are incorrect: {}'.format(error))
 41 | 
 42 |     MAX_TRACE_SUMMARIES = 3
 43 |     limited_trace_summaries = response.get('TraceSummaries', [])[:MAX_TRACE_SUMMARIES]
 44 | 
 45 |     trace_summary = {
 46 |         "TraceSummaries": limited_trace_summaries,
 47 |         # Include other keys from the original response if necessary
 48 |         # "UnprocessedTraceIds": response.get("UnprocessedTraceIds", []),
 49 |         # "NextToken": response.get("NextToken", None),
 50 |         # "ApproximateTime": response.get("ApproximateTime", None),
 51 |         # "TracesProcessedCount": len(limited_trace_summaries),
 52 |     }
 53 | 
 54 |     logger.info("Trace Summary", extra=trace_summary)        
 55 |     
 56 | 
 57 |     # Create a table containing the resources in the trace
 58 |     # Initialize list for combined data
 59 |     combined_data = []
 60 | 
 61 |     # Extract and combine service IDs with Type AWS::EC2::Instance and their InstanceIds
 62 |     for summary in response["TraceSummaries"]:
 63 |         instance_ids = [instance["Id"] for instance in summary.get("InstanceIds", [])]
 64 |         for service in summary["ServiceIds"]:
 65 |             service_name = service.get("Name", "Unknown")
 66 |             service_type = service.get("Type", "Unknown")  # Provide a default value for 'Type' if it's missing
 67 | 
 68 |             # Special treatment for EC2 instance types
 69 |             if service_type == "AWS::EC2::Instance":
 70 |                 for instance_id in instance_ids:
 71 |                     combined_data.append({"Name": service_name, "Type": service_type, "InstanceId": instance_id})
 72 |             else:
 73 |                 # General treatment for all other service types
 74 |                 combined_data.append({"Name": service_name, "Type": service_type, "InstanceId": None})
 75 |     
 76 |     # Process the data
 77 |     df_combined = pd.DataFrame(combined_data).drop_duplicates().reset_index(drop=True)
 78 |     html_combined = df_combined.to_html(index=False)
 79 | 
 80 |     # Adjust the table
 81 |     new_table_tag = '<table id="info" width="640" style="max-width:640px !important; border-collapse: collapse; margin-bottom:10px;" cellpadding="2" cellspacing="0" width="100%" align="center" border="0">'
 82 |     html_combined = html_combined.replace('<table border="1" class="dataframe">', new_table_tag)
 83 |     html_combined = html_combined.replace('<tr style="text-align: right;">','<tr>')
 84 |     html_combined = html_combined.replace('<thead>', f'<thead><tr><th colspan="3" style="text-align: center;">Resources in Trace</th></tr>')
 85 |     
 86 |     # Extract the latest trace ID
 87 |     if response["TraceSummaries"]:
 88 |         latest_trace = max(response["TraceSummaries"], key=lambda trace: trace["StartTime"]) 
 89 |         trace_id = latest_trace["Id"]    
 90 |     else:
 91 |         trace_id = None
 92 |     
 93 |     if trace_id:
 94 |         try:
 95 |             response = xray.batch_get_traces(TraceIds=[trace_id])
 96 |         except botocore.exceptions.ClientError as error:
 97 |             logger.exception("Error retrieving trace")
 98 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
 99 |         except botocore.exceptions.ParamValidationError as error:
100 |             raise ValueError('The parameters you provided are incorrect: {}'.format(error))            
101 |         
102 |         logger.info("Traces", traces=response)
103 |         
104 |         # Assuming there is only one trace in the batch
105 |         trace_data = response['Traces'][0]
106 |         html = generate_trace_html(response, region, trace_start_time, trace_end_time)  
107 |         
108 |         # Minimize the HTML content by removing newlines and redundant whitespace
109 |         minimized_trace_html_content = html_combined
110 |         minimized_trace_html_content += ' '.join(html.split())
111 |         
112 |         # Log the minimized HTML content to the logs in one line
113 |         logger.info("Trace HTML", html=minimized_trace_html_content)                
114 |     else:
115 |         logger.info("No trace ID found in the summary.")
116 |         minimized_trace_html_content = "" 
117 |     
118 |     return trace_summary, minimized_trace_html_content
119 | 
120 | @tracer.capture_method
121 | def generate_trace_html(traces_response, region, start_time, end_time):
122 |     
123 |     for trace in traces_response.get('Traces', []):
124 |         trace_id =  trace.get('Id') 
125 |    
126 |    
127 |    # Check if start_time and end_time are string instances and parse them if true
128 |     if isinstance(start_time, str):
129 |         start_time = datetime.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S.%f%z')
130 |     if isinstance(end_time, str):
131 |         end_time = datetime.datetime.strptime(end_time, '%Y-%m-%dT%H:%M:%S.%f%z')
132 | 
133 |     # Format start_time and end_time to strings as needed
134 |     start_time_str = start_time.strftime('%Y-%m-%dT%H*3a%M*3a%S.%f')[:-3] +"Z"
135 |     end_time_str = end_time.strftime('%Y-%m-%dT%H*3a%M*3a%S.%f')[:-3] +"Z"
136 |     
137 |     
138 |     link = f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#xray:traces/{trace_id}?~(query~()~context~(timeRange~(end~'{end_time_str}~start~'{start_time_str})))"
139 |     button = get_dashboard_button("Trace %s details" % (str(trace_id)), link)
140 |         
141 |     html_output = f"""
142 |     <table id="traces" width="640" cellspacing="0" align="center" border="0" style="max-width:640px!important; border-collapse:collapse; margin-bottom:10px">   
143 |         <tr>
144 |             <td colspan="5" style="font-weight: bold; padding: 5px; border: 1px solid #ddd;">{button}</td>
145 |         </tr>    
146 |         <tr>
147 |             <td style="padding: 2px; border: 1px solid #ddd; font-size: small; max-width:155px;" width="155">Node</td>
148 |             <td style="padding: 2px; border: 1px solid #ddd; font-size: small; max-width:40px;" width="40">Stat.</td>
149 |             <td style="padding: 2px; border: 1px solid #ddd; font-size: small; max-width:40px;" width="40">Resp.</td>
150 |             <td style="padding: 2px; border: 1px solid #ddd; font-size: small; max-width:40px;" width="40">Dur.</td>
151 |             <td style="padding: 2px; border: 1px solid #ddd; font-size: small; min-width:340px;">Timeline</td>
152 |         </tr>
153 |     """
154 |     
155 |     # Determine the earliest start time and latest end time for overall scaling
156 |     earliest_start = float('inf')
157 | 
158 |     # Extract all segments and sort them
159 |     all_segments = []
160 |     for trace in traces_response.get('Traces', []):
161 |         timeline_scale =  trace.get('Duration') 
162 |         for segment in trace.get('Segments', []):
163 |             segment_doc = json.loads(segment['Document'])
164 |             earliest_start = min(earliest_start, segment_doc.get('start_time', earliest_start))
165 |             all_segments.append(segment_doc)
166 | 
167 |     # Sort segments by start time
168 |     sorted_segments = sorted(all_segments, key=lambda x: x.get('start_time', 0))
169 |     
170 |     for segment_doc in sorted_segments:
171 |         html_output += process_trace_segment(segment_doc, earliest_start, timeline_scale)
172 | 
173 |     # HTML boilerplate end
174 |     html_output += """
175 |     </table>
176 |     """
177 |     return html_output
178 |     
179 | @tracer.capture_method
180 | def process_trace_segment(segment_doc, earliest_start, timeline_scale, is_subsegment=False):
181 |     name = segment_doc.get('name', 'Unknown')
182 |     origin = segment_doc.get('origin', '')
183 |     start_time = segment_doc.get('start_time', 0)
184 |     end_time = segment_doc.get('end_time', 0)
185 |     duration = (end_time - start_time)
186 |     offset = (start_time - earliest_start) / timeline_scale * 100
187 |     bar_width = duration / timeline_scale * 100
188 |     duration_in_ms = round(duration * 1000)
189 |     response_code = segment_doc.get('http', {}).get('response', {}).get('status', '-')
190 | 
191 |     # Set Status:
192 |     if segment_doc.get('fault'):
193 |         status = "Fault"
194 |         color = "#fe6e73"  # Reddish for fault
195 |     elif segment_doc.get('error'):
196 |         status = "Error"
197 |         color = "#c59600"  # Yellowish for error
198 |     elif segment_doc.get('throttle'):
199 |         status = "Throttle"
200 |         color = "#b088f5"  # Purplish for throttle
201 |     else:
202 |         status = "OK"
203 |         color = "#4CAF50"  # Green for OK  
204 |     
205 |     html_output = ""
206 | 
207 |     bar_container_style = "position: relative; width: 100%; background-color: #ddd; height: 20px; min-width: 340px;"
208 |     bar_style = f"position: absolute; height: 100%; background-color: {color}; left: {offset}%; width: {bar_width}%;"
209 |     td_style = "padding: 2px; border: 1px solid #ddd; overflow: hidden; white-space: nowrap; text-overflow: ellipsis; font-size: small;"
210 | 
211 |     if not is_subsegment:
212 |         html_output += f"""
213 |             <tr>
214 |                 <td colspan="5" style="font-weight: bold; padding: 5px; border: 1px solid #ddd;">{name + ('&nbsp;&nbsp;&nbsp;&nbsp;' + origin if origin != '' else '')}</td>
215 |             </tr>
216 |         """   
217 |     
218 |     html_output += f"""
219 |         <tr>
220 |             <td style="{td_style} max-width:155px;" width="155">&nbsp;&nbsp;&nbsp;&nbsp;{name}</td>
221 |             <td style="{td_style} max-width:40px; color:{color};" width="40">{status}</td>
222 |             <td style="{td_style} max-width:40px;" width="40"">{response_code}</td>
223 |             <td style="{td_style} max-width:40px;" width="40"">{duration_in_ms}ms</td>
224 |             <td style="{td_style}">
225 |                 <div style="{bar_container_style}">
226 |                     <div style="{bar_style}"></div>
227 |                 </div>
228 |             </td>
229 |         </tr>
230 |     """
231 | 
232 |     # Process subsegments if they exist
233 |     subsegments = segment_doc.get('subsegments', [])
234 |     sorted_subsegments = sorted(subsegments, key=lambda x: x.get('start_time', 0))
235 | 
236 |     for subsegment in sorted_subsegments:
237 |         html_output += process_trace_segment(subsegment, earliest_start, timeline_scale, True)
238 |         
239 |     return html_output
240 | 


--------------------------------------------------------------------------------
/alarm_context_tool/health_client.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | from region_lookup import active_region
 4 | import boto3
 5 | from aws_lambda_powertools import Logger
 6 | from aws_lambda_powertools import Tracer
 7 | logger = Logger()
 8 | tracer = Tracer()
 9 | 
10 | class ActiveRegionHasChangedError(Exception):
11 |     """Rasied when the active region has changed"""
12 |     pass
13 | 
14 | class HealthClient:
15 |     __active_region = None
16 |     __client = None
17 | 
18 |     @staticmethod
19 |     @tracer.capture_method
20 |     def client():
21 |         if not HealthClient.__active_region:
22 |             HealthClient.__active_region = active_region()
23 |         else:
24 |             current_active_region = active_region()
25 |             if current_active_region != HealthClient.__active_region:
26 |                 old_active_region = HealthClient.__active_region
27 |                 HealthClient.__active_region = current_active_region
28 | 
29 |                 if HealthClient.__client:
30 |                     HealthClient.__client = None
31 | 
32 |                 raise ActiveRegionHasChangedError('Active region has changed from [' + old_active_region + '] to [' + current_active_region + ']')
33 | 
34 |         if not HealthClient.__client:
35 |             HealthClient.__client = boto3.client('health', region_name=HealthClient.__active_region)
36 | 
37 |         return HealthClient.__client


--------------------------------------------------------------------------------
/alarm_context_tool/lambda_function.py:
--------------------------------------------------------------------------------
  1 | # Import required libraries and modules
  2 | import boto3
  3 | import json
  4 | import os
  5 | import datetime
  6 | import base64
  7 | 
  8 | # Import custom handlers and functions
  9 | import sns_handler
 10 | import ec2_handler
 11 | import synthetics_handler
 12 | import dynamodb_handler
 13 | import ecs_handler
 14 | import lambda_handler
 15 | import ssm_run_command_handler
 16 | import application_elb_handler
 17 | import api_gateway_handler
 18 | import rds_handler
 19 | import s3_handler
 20 | import eks_handler
 21 | 
 22 | from email.mime.multipart import MIMEMultipart
 23 | from email.mime.text import MIMEText
 24 | from email.mime.application import MIMEApplication
 25 | 
 26 | from functions import get_html_table
 27 | from functions_metrics import generate_main_metric_widget
 28 | from functions_metrics import get_metric_data
 29 | from functions import create_test_case
 30 | from functions_metrics import get_metric_array
 31 | from functions_health import describe_events
 32 | from functions_email import build_email_summary
 33 | from functions_email import get_generic_links
 34 | from functions_email import send_email
 35 | from functions_email import build_html_body
 36 | 
 37 | from functions_alarm import get_alarm_history
 38 | from functions_cloudformation import get_cloudformation_template
 39 | from functions_bedrock import construct_prompt
 40 | from functions_bedrock import execute_prompt
 41 | 
 42 | from health_client import ActiveRegionHasChangedError
 43 | 
 44 | from aws_lambda_powertools import Logger
 45 | from aws_lambda_powertools import Tracer
 46 | from aws_lambda_powertools.utilities.typing import LambdaContext
 47 | logger = Logger()
 48 | tracer = Tracer()
 49 | 
 50 | 
 51 | @logger.inject_lambda_context(log_event=True)
 52 | @tracer.capture_lambda_handler
 53 | def alarm_handler(event, context):
 54 |     """
 55 |     Lambda function handler to process CloudWatch alarms.
 56 |     
 57 |     Args:
 58 |         event (dict): Lambda event payload.
 59 |         context (LambdaContext): Lambda context object.
 60 |     """
 61 |     # Log Boto 3 version
 62 |     fields = {"boto3_version": boto3.__version__}
 63 |     logger.info("Starting", extra=fields)
 64 | 
 65 |     # Log JSON that can be used as a test case for this Lambda function
 66 |     test_case = create_test_case(event)
 67 |     logger.info("test_case", extra=test_case)
 68 | 
 69 |     # =============================================================================
 70 |     # Section: Initial variables
 71 |     # =============================================================================
 72 | 
 73 |     message = json.loads(event['Records'][0]['Sns']['Message'])
 74 |     alarm_name = message['AlarmName']
 75 |     alarm_description = message['AlarmDescription']
 76 |     new_state = message['NewStateValue']
 77 |     reason = message['NewStateReason']
 78 |     state_change_time = message['StateChangeTime']
 79 |     alarm_arn = message['AlarmArn']
 80 |     region_name = message['Region']
 81 | 
 82 |     # Get array of metrics and variables for first metric
 83 |     namespace, metric_name, statistic, dimensions, metrics_array = get_metric_array(message['Trigger'])
 84 | 
 85 |     # Add annotations to trace for Namespace and dimensions
 86 |     tracer.put_annotation(key="Namespace", value=namespace)
 87 |     for elements in dimensions:
 88 |         tracer.put_annotation(key=elements['name'], value=elements['value'])
 89 | 
 90 |     # Datetime variables
 91 |     change_time = datetime.datetime.strptime(
 92 |         state_change_time, "%Y-%m-%dT%H:%M:%S.%f%z")
 93 |     annotation_time = change_time.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
 94 |     start = change_time + datetime.timedelta(minutes=-115)
 95 |     start_time = start.strftime(
 96 |         '%Y-%m-%dT%H:%M:%S.%f')[:-3] + start.strftime('%z')
 97 |     end = change_time + datetime.timedelta(minutes=5)
 98 |     end_time = end.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + end.strftime('%z')
 99 |     display_change_time = change_time.strftime("%A %d %B, %Y %H:%M:%S %Z")
100 | 
101 |     # Extract Region and Account ID from alarm ARN
102 |     elements = alarm_arn.split(':')
103 |     result = {
104 |         'arn': elements[0],
105 |         'partition': elements[1],
106 |         'service': elements[2],
107 |         'region': elements[3],
108 |         'account_id': elements[4],
109 |         'resource_type': elements[5],
110 |         'resource_id': elements[6]
111 |     }
112 |     region = result['region']
113 |     account_id = result['account_id']
114 | 
115 |     # =============================================================================
116 |     # Section: Process alarm by namespace
117 |     # =============================================================================
118 | 
119 |     namespace_defined = True
120 |     logger.info(dimensions)
121 | 
122 |     if namespace == "AWS/EC2":
123 |         response = ec2_handler.process_ec2(metric_name, dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
124 | 
125 |     elif namespace == "CloudWatchSynthetics":
126 |         response = synthetics_handler.process_synthetics(
127 |             dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
128 | 
129 |     elif namespace == "AWS/SNS":
130 |         response = sns_handler.process_sns_topic(
131 |             dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
132 | 
133 |     elif namespace == "AWS/DynamoDB":
134 |         response = dynamodb_handler.process_dynamodb(
135 |             dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
136 | 
137 |     elif namespace in ("AWS/ECS", "ECS/ContainerInsights"):
138 |         response = ecs_handler.process_ecs(
139 |             dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
140 | 
141 |     elif namespace == "AWS/Lambda":
142 |         response = lambda_handler.process_lambda(
143 |             metric_name, dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
144 | 
145 |     elif namespace == "AWS/SSM-RunCommand":
146 |         response = ssm_run_command_handler.process_ssm_run_command(
147 |             metric_name, dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
148 | 
149 |     elif namespace == "AWS/ApplicationELB":
150 |         response = application_elb_handler.process_application_elb(
151 |             dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
152 | 
153 |     elif namespace == "AWS/ApiGateway":
154 |         response = api_gateway_handler.process_api_gateway(
155 |             dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end)
156 |     
157 |     elif namespace == "AWS/RDS":
158 |         response = rds_handler.process_rds(metric_name, dimensions, region, account_id,
159 |                                            namespace, change_time, annotation_time, start_time, end_time, start, end)
160 | 
161 |     elif namespace == "AWS/S3" or namespace == "AWS/S3/Storage-Lens":
162 |         response = s3_handler.process_s3(metric_name, dimensions, region, account_id,
163 |                                          namespace, change_time, annotation_time, start_time, end_time, start, end)
164 | 
165 |     elif namespace == "ContainerInsights":
166 |         response = eks_handler.process_eks(metric_name, dimensions, region, account_id,
167 |                                            namespace, change_time, annotation_time, start_time, end_time, start, end)
168 | 
169 |     else:
170 |         # Namespace not matched
171 |         # TO DO: use describe-metric-filters to see if this is a metric filter metric and then get log data.
172 |         contextual_links = None
173 |         log_information = None
174 |         log_events = None
175 |         resource_information = None
176 |         resource_information_object = None
177 |         widget_images = None
178 |         additional_metrics_with_timestamps_removed = None
179 |         trace_summary = None
180 |         trace_html = None
181 |         notifications = None
182 |         tags = None
183 |         namespace_defined = False
184 |         logger.info("undefined_namespace_dimensions",
185 |                     extra={"namespace": namespace})
186 | 
187 |     # =============================================================================
188 |     # Section: Build Email
189 |     # =============================================================================
190 | 
191 |     text_summary = 'Your Amazon CloudWatch Alarm "%s" in the %s region has entered the %s state, because "%s" at "%s".' % (
192 |         alarm_name, region_name, new_state, reason, display_change_time)
193 |     summary = build_email_summary(
194 |         alarm_name, region_name, new_state, reason, display_change_time, alarm_description, region)
195 | 
196 |     # Metric Details
197 |     metric_details = get_html_table("Metrics", message['Trigger'])
198 | 
199 |     # Alarm Details - Remove Trigger to avoid duplication
200 |     alarm_display = dict(message)
201 |     alarm_display.pop("Trigger", None)
202 |     alarm_details = get_html_table("Alarm", alarm_display)
203 | 
204 |     # Get Generic Links
205 |     generic_information = get_generic_links(region)
206 |     additional_information = generic_information
207 | 
208 |     if namespace_defined:
209 |         contextual_links = response.get("contextual_links")
210 |         log_information = response.get("log_information")
211 |         log_events = response.get("log_events")
212 |         resource_information = response.get("resource_information")
213 |         resource_information_object = response.get("resource_information_object")
214 |         notifications = response.get("notifications")
215 |         widget_images = response.get("widget_images")
216 |         additional_metrics_with_timestamps_removed = response.get("additional_metrics_with_timestamps_removed")
217 |         trace_summary = response.get("trace_summary")
218 |         trace_html = response.get("trace")
219 |         tags = response.get("tags", [])
220 | 
221 |         if notifications is not None:
222 |             summary += notifications
223 |         if contextual_links is not None:
224 |             additional_information += contextual_links
225 |         if log_information is not None:
226 |             additional_information += log_information
227 |         if resource_information is not None:
228 |             additional_information += resource_information
229 | 
230 |     # Get main widget
231 |     graph = generate_main_metric_widget(metrics_array, annotation_time, region, start_time, end_time)
232 | 
233 |     # Get metric data
234 |     metric_data = get_metric_data(region, message['Trigger'], metric_name, account_id, change_time, end_time)
235 | 
236 |     # Alarm History
237 |     alarm_history = get_alarm_history(region, alarm_name)
238 | 
239 |     # AWS Health - See https://github.com/aws/aws-health-tools/tree/master/high-availability-endpoint/python
240 | 
241 |     # If you don't have Business or a higher level of support the below code will give a SubscriptionRequiredError, see (https://docs.aws.amazon.com/health/latest/APIReference/API_EnableHealthServiceAccessForOrganization.html)
242 |     restart_workflow = True
243 |     while restart_workflow:
244 |         try:
245 |             health_events = describe_events(region)
246 |             restart_workflow = False
247 |         except ActiveRegionHasChangedError as are:
248 |             logger.info("The AWS Health API active region has changed. Restarting the workflow using the new active region!, %s", are)
249 |         except:
250 |             health_events = None
251 |             restart_workflow = False
252 | 
253 |     # Get truncated CloudFormation template
254 |     if tags:
255 |         max_length = 50  # Maximum length of CloudFormation Value to shorten prompt
256 |         truncated_cloudformation_template = get_cloudformation_template(
257 |             tags, region, trace_summary, max_length)
258 |     else:
259 |         truncated_cloudformation_template = None
260 | 
261 |     # Contruct Bedrock prompt
262 |     prompt = construct_prompt(alarm_history, message, metric_data, text_summary, health_events, truncated_cloudformation_template,
263 |                               resource_information_object, log_events, additional_metrics_with_timestamps_removed, trace_summary)
264 |     logger.info("bedrock_prompt", prompt=prompt)
265 | 
266 |     # Execute Bedrock Prompt
267 |     ai_response = execute_prompt(prompt)
268 | 
269 |     # =============================================================================
270 |     # Section: Create attachments
271 |     # =============================================================================
272 | 
273 |     sender = os.environ.get('SENDER')
274 |     recipient = os.environ.get('RECIPIENT')
275 |     subject = "ALARM: " + alarm_name
276 |     BODY_TEXT = text_summary
277 | 
278 |     # Deal with attachments
279 |     attachments = []
280 | 
281 |     # Base64 Link Icon
282 |     link_icon_data = base64.b64decode(
283 |         b'iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAj0lEQVQ4ja2SwQ3CMBAEx6kgedIFbVAHaZIeQhmEdLF8LshEd9YZWGllS/aMbclIukha5QdrmCJpBU74KTYqWGeo4OKUw9oE3D8MznWjjpIW27vUUEZwhKcegQeTFURwStCC340EKbgluCXg5hPOJglP3qEiSdVn6Yn2n/hT/iJ42lydBdgGYAa2Lw5/ANcX9a8GnTGB0iAAAAAASUVORK5CYII=')
284 |     attachments.append({"filename": "link_icon.png",
285 |                        "data": link_icon_data, "id": "<imageId2>"})
286 | 
287 |     # Main Widget Graph
288 |     attachments.append({"filename": "main_widget_graph.png",
289 |                        "data": graph, "id": "<imageId>"})
290 | 
291 |     # Widget Images
292 |     if widget_images:
293 |         for widget_image in widget_images:
294 |             filename = f'{widget_image["widget"].replace(" ", "_")}.png'
295 |             content_id = f'<{widget_image["widget"].replace(" ", "_")}>'
296 |             attachments.append(
297 |                 {"filename": filename, "data": widget_image['data'], "id": content_id})
298 | 
299 |     # Get HTML
300 |     BODY_HTML = build_html_body(subject, summary, ai_response, widget_images,
301 |                                 trace_html, additional_information, alarm_details, metric_details)
302 | 
303 |     send_email(
304 |         sender=sender,
305 |         recipient=recipient,
306 |         subject=subject,
307 |         body_text=BODY_TEXT,
308 |         body_html=BODY_HTML,
309 |         attachments=attachments
310 |     )
311 | 


--------------------------------------------------------------------------------
/alarm_context_tool/lambda_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | 
  4 | import datetime
  5 | 
  6 | from functions import get_dashboard_button
  7 | from functions import get_html_table
  8 | from functions_logs import get_last_10_events
  9 | from functions_logs import get_log_insights_link
 10 | from functions_metrics import build_dashboard
 11 | from functions_metrics import get_metrics_from_dashboard_metrics
 12 | from functions_xray import process_traces
 13 | 
 14 | from aws_lambda_powertools import Logger
 15 | from aws_lambda_powertools import Tracer
 16 | logger = Logger()
 17 | tracer = Tracer()
 18 | 
 19 | @tracer.capture_method
 20 | def process_lambda(metric_name, dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
 21 | 
 22 |     lambda_automatic_dashboard_link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#home:dashboards/Lambda?~(alarmStateFilter~(~\'ALARM))' % (region, region)   
 23 |     contextual_links = get_dashboard_button("Lambda automatic dashboard" , lambda_automatic_dashboard_link) 
 24 | 
 25 |     if dimensions:
 26 |         for elements in dimensions:
 27 |             if elements['name'] == 'FunctionName':
 28 |                 id = elements['value']
 29 | 
 30 |                 lambda_automatic_dashboard_link = 'https://%s.console.aws.amazon.com/lambda/home?region=%s#/functions/%s?tab=monitoring' % (region, region, str(id))   
 31 |                 contextual_links += get_dashboard_button("Lambda Function Monitoring" , lambda_automatic_dashboard_link)                
 32 | 
 33 |                 # Get Function
 34 |                 lambda_client = boto3.client('lambda', region_name=region)
 35 |                 try:
 36 |                     response = lambda_client.get_function(FunctionName=id)
 37 |                 except botocore.exceptions.ClientError as error:
 38 |                     logger.exception("Error getting Lambda Function")
 39 |                     raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
 40 |                 except botocore.exceptions.ParamValidationError as error:
 41 |                     raise ValueError('The parameters you provided are incorrect: {}'.format(error))             
 42 |                 
 43 |                 layers = response['Configuration']['Layers']
 44 | 
 45 |                 # Code is too noisy, remove it
 46 |                 response.pop("Code", None)
 47 |                 resource_information = get_html_table("Function: " +id, response["Configuration"])
 48 |                 resource_information += get_html_table("Function: " +id, response["Tags"])            
 49 |                 resource_information_object = response["Configuration"]
 50 |                 
 51 |                 # Check if Lambda Insights is Enabled
 52 |                 lambda_insights_enabled = False
 53 |                 for layer in layers:
 54 |                     if layer['Arn'].startswith('arn:aws:lambda:'):
 55 |                         layer_name_version = layer['Arn'].split(':')[-2]
 56 |                         if layer_name_version.startswith('LambdaInsightsExtension'):
 57 |                             lambda_insights_enabled = True
 58 | 
 59 |                 if lambda_insights_enabled:
 60 |                     lambda_insights_link = f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#lambda-insights:functions/{id}"
 61 |                     contextual_links += get_dashboard_button("Lambda Insights" , lambda_insights_link) 
 62 |                 else:
 63 |                     notifications = '<p>You do not have Lambda Insights enabled for this Lambda function. CloudWatch Lambda Insights is a monitoring and troubleshooting solution for serverless applications running on AWS Lambda. The solution collects, aggregates, and summarizes system-level metrics including CPU time, memory, disk and network usage. It also collects, aggregates, and summarizes diagnostic information such as cold starts and Lambda worker shutdowns to help you isolate issues with your Lambda functions and resolve them quickly.<a href="https://%s.console.aws.amazon.com/lambda/home?region=%s#/functions/%s/edit/monitoring-tools?tab=configure">Enable Lambda Insights</a>' % (region, region, id)                
 64 | 
 65 |                 dashboard_metrics = [
 66 |                     {
 67 |                         "title": "Invocations",
 68 |                         "view": "timeSeries",
 69 |                         "stacked": False,
 70 |                         "stat": "Sum",
 71 |                         "period": 60,
 72 |                         "metrics": [
 73 |                             [namespace, "Invocations", "FunctionName", id],
 74 |                         ]
 75 |                     },
 76 |                     {
 77 |                         "title": "Duration",
 78 |                         "view": "timeSeries",
 79 |                         "stacked": False,
 80 |                         "stat": "Average",
 81 |                         "period": 60,
 82 |                         "metrics": [
 83 |                             [namespace, "Duration", 'FunctionName', id]
 84 |                         ]
 85 |                     },
 86 |                     {
 87 |                         "title": "Errors",
 88 |                         "view": "timeSeries",
 89 |                         "stacked": False,
 90 |                         "stat": "Sum",
 91 |                         "period": 60,
 92 |                         "metrics": [
 93 |                             [namespace, "Errors", 'FunctionName', id]
 94 |                         ]
 95 |                     },
 96 |                     {
 97 |                         "title": "Throttles",
 98 |                         "view": "timeSeries",
 99 |                         "stacked": False,
100 |                         "stat": "Sum",
101 |                         "period": 60,
102 |                         "metrics": [
103 |                             [namespace, "Throttles", 'FunctionName', id]
104 |                         ]
105 |                     }
106 |                 ]
107 |                 
108 |                 if lambda_insights_enabled:
109 |                     dashboard_metrics.append({
110 |                         "title": "Memory Utilization",
111 |                         "view": "timeSeries",
112 |                         "stacked": False,
113 |                         "stat": "Maximum",
114 |                         "period": 60,
115 |                         "metrics": [
116 |                             ["LambdaInsights", "memory_utilization", "function_name", id]
117 |                         ]
118 |                     })
119 |                     dashboard_metrics.append({
120 |                         "title": "CPU Total Time",
121 |                         "view": "timeSeries",
122 |                         "stacked": False,
123 |                         "stat": "Maximum",
124 |                         "period": 60,
125 |                         "metrics": [
126 |                             ["LambdaInsights", "cpu_total_time", "function_name", id]
127 |                         ]
128 |                     })
129 |                     dashboard_metrics.append({
130 |                         "title": "Total Network",
131 |                         "view": "timeSeries",
132 |                         "stacked": False,
133 |                         "stat": "Maximum",
134 |                         "period": 60,
135 |                         "metrics": [
136 |                             ["LambdaInsights", "total_network", "function_name", id]
137 |                         ]
138 |                     })
139 | 
140 |                 widget_images = build_dashboard(dashboard_metrics, annotation_time, start, end, region) 
141 |                 additional_metrics_with_timestamps_removed = get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region)
142 | 
143 |                 
144 |                 log_input = {"logGroupName": "/aws/lambda/" +id}
145 |                 log_information, log_events =  get_last_10_events(log_input, change_time, region) 
146 |                 
147 |                 # Log Insights Link
148 |                 log_insights_query = """filter @message like /(?i)(Exception|error|fail)/ or @message LIKE /Task timed out/
149 |                     | fields @timestamp, @message 
150 |                     | sort @timestamp desc 
151 |                     | limit 100"""
152 |                 log_insights_link = get_log_insights_link(log_input, log_insights_query, region, start_time, end_time)
153 |                 contextual_links += get_dashboard_button("Log Insights" , log_insights_link)                 
154 |                         
155 |                 # These date formats are required for some console URLs
156 |                 start_time_str = str(datetime.datetime.strptime(start_time,'%Y-%m-%dT%H:%M:%S.%f%z').strftime('%Y-%m-%dT%H*3a%M*3a%S.%f')[:-3]) +"Z"
157 |                 end_time_str = str(datetime.datetime.strptime(end_time,'%Y-%m-%dT%H:%M:%S.%f%z').strftime('%Y-%m-%dT%H*3a%M*3a%S.%f')[:-3]) +"Z"
158 |                 
159 |                 # Check if active tracing is enabled
160 |                 if response["Configuration"]["TracingConfig"]["Mode"] == "Active":
161 |                     x_ray_traces_link = f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#xray:traces/query?~(query~(filter~()~expression~'service*28id*28name*3a*20*22{id}*22*2c*20type*3a*20*22AWS*3a*3aLambda*3a*3aFunction*22*29*29)~context~(timeRange~(end~'{end_time_str}~start~'{start_time_str})))"
162 |                     contextual_links += get_dashboard_button("X-Ray Traces" , x_ray_traces_link)
163 | 
164 |                     # Get Trace information
165 |                     filter_expression = f'!OK and service(id(name: "{id}", type: "AWS::Lambda::Function")) AND service(id(account.id: "{account_id}"))'
166 |                     logger.info("X-Ray Filter Expression", filter_expression=filter_expression)
167 |                     trace_summary, trace = process_traces(filter_expression, region, start_time, end_time)                
168 |                 else:
169 |                     trace_summary = None
170 |                     trace = None
171 |                
172 |             else:
173 |                 contextual_links = None
174 |                 log_information = None
175 |                 log_events = None
176 |                 resource_information = None
177 |                 resource_information_object = None
178 |                 widget_images = None
179 |                 additional_metrics_with_timestamps_removed = None
180 |                 trace_summary = None
181 |                 trace = None
182 |                 notifications = None
183 |         return {
184 |             "contextual_links": contextual_links,
185 |             "log_information": log_information,
186 |             "log_events": log_events,
187 |             "resource_information": resource_information,
188 |             "resource_information_object": resource_information_object,
189 |             "notifications": None,
190 |             "widget_images": widget_images,
191 |             "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
192 |             "trace_summary": trace_summary,
193 |             "trace": trace
194 |         }            
195 |     elif metric_name:
196 |         dashboard_metrics = [
197 |             {
198 |                 "title": "Invocations",
199 |                 "view": "timeSeries",
200 |                 "stacked": False,
201 |                 "stat": "Sum",
202 |                 "period": 60,
203 |                 "metrics": [
204 |                     [namespace, "Invocations"],
205 |                 ]
206 |             },
207 |             {
208 |                 "title": "Duration",
209 |                 "view": "timeSeries",
210 |                 "stacked": False,
211 |                 "stat": "Average",
212 |                 "period": 60,
213 |                 "metrics": [
214 |                     [namespace, "Duration"]
215 |                 ]
216 |             },
217 |             {
218 |                 "title": "Errors",
219 |                 "view": "timeSeries",
220 |                 "stacked": False,
221 |                 "stat": "Sum",
222 |                 "period": 60,
223 |                 "metrics": [
224 |                     [namespace, "Errors"]
225 |                 ]
226 |             },
227 |             {
228 |                 "title": "Throttles",
229 |                 "view": "timeSeries",
230 |                 "stacked": False,
231 |                 "stat": "Sum",
232 |                 "period": 60,
233 |                 "metrics": [
234 |                     [namespace, "Throttles"]
235 |                 ]
236 |             }
237 |         ]
238 |         widget_images = build_dashboard(dashboard_metrics, annotation_time, start, end, region) 
239 |         additional_metrics_with_timestamps_removed = get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region)     
240 |         log_information = None
241 |         log_events = None        
242 |         trace_summary = None
243 |         trace = None        
244 |         resource_information = None
245 |         resource_information_object = None        
246 |     else:
247 |         contextual_links = None
248 |         log_information = None
249 |         log_events = None
250 |         resource_information = None
251 |         resource_information_object = None
252 |         widget_images = None
253 |         additional_metrics_with_timestamps_removed = None
254 |         trace_summary = None
255 |         trace = None
256 |         notifications = None
257 |     return {
258 |         "contextual_links": contextual_links,
259 |         "log_information": log_information,
260 |         "log_events": log_events,
261 |         "resource_information": None,
262 |         "resource_information_object": None,
263 |         "notifications": None,
264 |         "widget_images": widget_images,
265 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
266 |         "trace_summary": None,
267 |         "trace": None
268 |     }            
269 | 


--------------------------------------------------------------------------------
/alarm_context_tool/region_lookup.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | import dns.resolver
 4 | from aws_lambda_powertools import Logger
 5 | from aws_lambda_powertools import Tracer
 6 | logger = Logger()
 7 | tracer = Tracer()
 8 | 
 9 | class RegionLookupError(Exception):
10 |     """Raised when there was a problem when looking up the active region"""
11 |     pass
12 | 
13 | @tracer.capture_method
14 | def active_region():
15 |     qname = 'global.health.amazonaws.com'
16 |     try:
17 |         answers = dns.resolver.resolve(qname, 'CNAME')
18 |     except Exception as e:
19 |         raise RegionLookupError('Failed to resolve {}'.format(qname), e)
20 |     if len(answers) != 1:
21 |         raise RegionLookupError('Failed to get a single answer when resolving {}'.format(qname))
22 |     name = str(answers[0].target) # e.g. health.us-east-1.amazonaws.com.
23 |     region_name = name.split('.')[1] # Region name is the 1st in split('.') -> ['health', 'us-east-1', 'amazonaws', 'com', '']
24 |     return region_name


--------------------------------------------------------------------------------
/alarm_context_tool/s3_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | import datetime
  4 | 
  5 | from functions import get_dashboard_button
  6 | from functions import get_html_table
  7 | from functions_logs import get_last_10_events
  8 | from functions_logs import get_log_insights_link
  9 | from functions_xray import process_traces
 10 | from functions_metrics import build_dashboard
 11 | from functions_metrics import get_metrics_from_dashboard_metrics
 12 | from functions import get_information_panel
 13 | 
 14 | from aws_lambda_powertools import Logger
 15 | from aws_lambda_powertools import Tracer
 16 | logger = Logger()
 17 | tracer = Tracer()
 18 | 
 19 | 
 20 | @tracer.capture_method
 21 | def get_storage_metrics(namespace, bucket_name):
 22 | 
 23 |     metrics = [("NumberOfObjects", "Average", 3600),
 24 |                ("BucketsSizeBytes", "Average", 3600)]
 25 | 
 26 |     metric_list = []
 27 |     for (metric, stat, period) in metrics:
 28 |         metric_list = metric_list + [{
 29 |             "title": metric,
 30 |             "view": "timeSeries",
 31 |             "stacked": False,
 32 |             "stat": stat,
 33 |             "period": period,
 34 |             "metrics": [
 35 |                 [namespace, metric, "BucketName", bucket_name]
 36 |             ]
 37 |         }]
 38 | 
 39 |     return metric_list
 40 | 
 41 | 
 42 | @tracer.capture_method
 43 | def get_request_metrics(namespace, bucket_name, filter_id):
 44 |     metrics = [("AllRequests", "Sum", 60),
 45 |                ("GetRequests", "Sum", 60),
 46 |                ("PutRequests", "Sum", 60),
 47 |                ("DeleteRequests", "Sum", 60),
 48 |                ("HeadRequests", "Sum", 60),
 49 |                ("PostRequests", "Sum", 60),
 50 |                ("ListRequests", "Sum", 60),
 51 |                ("BytesDownloaded", "Sum", 60),
 52 |                ("BytesUploaded", "Sum", 60),
 53 |                ("4xxErrors", "Sum", 60),
 54 |                ("5xxErrors", "Sum", 60),
 55 |                ("FirstByteLatency", "Average", 60),
 56 |                ("TtoalRequestLatency", "Average", 60)]
 57 | 
 58 |     metric_list = []
 59 |     for (metric, stat, period) in metrics:
 60 |         metric_list = metric_list + [{
 61 |             "title": metric,
 62 |             "view": "timeSeries",
 63 |             "stacked": False,
 64 |             "stat": stat,
 65 |             "period": period,
 66 |             "metrics": [
 67 |                 [namespace, metric, "BucketName",
 68 |                     bucket_name, "FilterId", filter_id]
 69 |             ]
 70 |         }]
 71 | 
 72 |     return metric_list
 73 | 
 74 | 
 75 | @tracer.capture_method
 76 | def get_replication_metrics(namespace, source_bucket, destination_bucket, rule_id):
 77 | 
 78 |     metrics = [("OperationsFailedReplication", "Sum", 60),
 79 |                ("OperationsPendingReplication", "Maximum", 60),
 80 |                ("ReplicationLatency", "Maximum", 60),
 81 |                ("BytesPendingReplication", "Maximum", 60)]
 82 | 
 83 |     metric_list = []
 84 |     for (metric, stat, period) in metrics:
 85 |         metric_list = metric_list + [{
 86 |             "title": metric,
 87 |             "view": "timeSeries",
 88 |             "stacked": False,
 89 |             "stat": stat,
 90 |             "period": period,
 91 |             "metrics": [
 92 |                 [namespace, metric, "SourceBucket", source_bucket,
 93 |                     "DestinationBucket", destination_bucket, "RuleId", rule_id]
 94 |             ]
 95 |         }]
 96 | 
 97 |     return metric_list
 98 | 
 99 | 
100 | @tracer.capture_method
101 | def get_storage_lens_metrics(namespace, bucket_name, aws_account_number, aws_region, configuration_id):
102 | 
103 |     metrics = [("StorageBytes", "Sum", 86400),
104 |                ("SelectScannedBytes", "Sum", 86400),
105 |                ("SelectReturnedBytes", "Sum", 86400),
106 |                ("SelectRequests", "Sum", 86400),
107 |                ("ReplicatedStorageBytesSource", "Sum", 86400),
108 |                ("ReplicatedStorageBytes", "Sum", 86400),
109 |                ("PutRequests", "Sum", 86400),
110 |                ("PostRequests", "Sum", 86400),
111 |                ("ObjectCount", "Sum", 86400),
112 |                ("NonCurrentVersionStorageBytes", "Sum", 86400),
113 |                ("ListRequests", "Sum", 86400),
114 |                ("IncompleteMultipartUploadStorageBytes", "Sum", 86400),
115 |                ("IncompleteMPUStorageBytesOlderThan7Days", "Sum", 86400),
116 |                ("EncryptedStorageBytes", "Sum", 86400),
117 |                ("UnencryptedStorageBytes", "Sum", 86400),
118 |                ("HeadRequests", "Sum", 86400),
119 |                ("GetRequests", "Sum", 86400),
120 |                ("DeleteRequests", "Sum", 86400),
121 |                ("DeleteMarkerStorageBytes", "Sum", 86400),
122 |                ("CurrentVersionStorageBytes", "Sum", 86400),
123 |                ("BytesUploaded", "Sum", 86400),
124 |                ("BytesDownloaded", "Sum", 86400),
125 |                ("AllRequests", "Sum", 86400),
126 |                ("AllUnsuportedSignatureRequests", "Sum", 86400),
127 |                ("AllUnsupportedTLSRequests", "Sum", 86400),
128 |                ("AllSSEKMSRequests", "Sum", 86400),
129 |                ("5xxErrors", "Sum", 86400),
130 |                ("4xxErrors", "Sum", 86400),
131 |                ("200OKStatusCount", "Sum", 86400)
132 |                ]
133 | 
134 |     metric_list = []
135 |     for (metric, stat, period) in metrics:
136 |         metric_list = metric_list + [{
137 |             "title": metric,
138 |             "view": "timeSeries",
139 |             "stacked": False,
140 |             "stat": stat,
141 |             "period": period,
142 |             "metrics": [
143 |                 [namespace, metric, "bucket_name", bucket_name, "aws_account_number",
144 |                     aws_account_number, "aws_region", aws_region, "configuration_id", configuration_id],
145 |             ]
146 |         }]
147 | 
148 |     return metric_list
149 | 
150 | 
151 | @tracer.capture_method
152 | def process_s3(metric_name, dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
153 | 
154 |     # S3 Automatic Dashboard
155 |     link = f'https://{region}.console.aws.amazon.com/cloudwatch/home?region={
156 |         region}#home:dashboards/S3?~(globalLegendEnabled~true)'
157 |     contextual_links = get_dashboard_button('S3 automatic dashboard', link)
158 | 
159 |     # # Initialize variables
160 |     resource_information = ""
161 |     resource_information_object = {}
162 |     widget_images = []
163 |     additional_metrics_with_timestamps_removed = []
164 |     notifications = ""
165 |     log_information = None
166 |     log_events = None
167 |     trace_summary = None
168 |     trace = None
169 |     notifications = None
170 |     tags = None
171 | 
172 |     if dimensions:
173 |         dimension_values = {element['name']: element['value']
174 |                             for element in dimensions}
175 | 
176 |         # Possible Dimensions:
177 |         # https://docs.aws.amazon.com/AmazonS3/latest/userguide/metrics-dimensions.html#s3-cloudwatch-dimensions
178 |         # https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-lens-cloudwatch-metrics-dimensions.html
179 |         # Either the dimensions will be:
180 |         # 1) bucket_name
181 |         # 2) bucket_name AND filter_id
182 |         # 3) source_buccket AND destinaiton_bucket AND rule_id
183 |         # 4) bucket_name AND aws_account_number AND aws_region AND configuration_id
184 |         if dimension_values.get('BucketName'):
185 |             bucket_name = dimension_values.get('BucketName')
186 |         elif dimension_values.get('bucket_name'):
187 |             bucket_name = dimension_values.get('bucket_name')
188 |         else:
189 |             bucket_name = None
190 |         filter_id = dimension_values.get('FilterId')
191 |         source_bucket = dimension_values.get('SourceBucket')
192 |         destination_bucket = dimension_values.get('DestinationBucket')
193 |         rule_id = dimension_values.get('RuleId')
194 |         aws_account_number = dimension_values.get('aws_account_number')
195 |         aws_region = dimension_values.get('aws_region')
196 |         configuration_id = dimension_values.get('configuration_id')
197 | 
198 |         # Initializing local variables
199 |         dashboard_metrics = []
200 |         adjusted_start = start
201 |         adjusted_end = end
202 | 
203 |         # Retrieving storage metrics (always present)
204 |         if bucket_name:
205 |             dashboard_metrics = dashboard_metrics + \
206 |                 get_storage_metrics(namespace, bucket_name)
207 |         elif destination_bucket:
208 |             dashboard_metrics = dashboard_metrics + \
209 |                 get_storage_metrics(namespace, destination_bucket)
210 | 
211 |         # Retrieving request metrics (if applicable)
212 |         if bucket_name and filter_id:
213 |             dashboard_metrics = dashboard_metrics + \
214 |                 get_request_metrics(namespace, bucket_name, filter_id)
215 | 
216 |         # Retrieving replication metrics (if applicable)
217 |         if source_bucket and destination_bucket and rule_id:
218 |             dashboard_metrics = dashboard_metrics + \
219 |                 get_replication_metrics(
220 |                     namespace, source_bucket, destination_bucket, rule_id)
221 | 
222 |         # Retrieving storage lens metrics (if applicable)
223 |         # Adjusting the start and end times for the dashboard metric widgets since storage lens metrics once per day
224 |         if bucket_name and aws_account_number and aws_region and configuration_id:
225 |             dashboard_metrics = dashboard_metrics + get_storage_lens_metrics(
226 |                 namespace, bucket_name, aws_account_number, aws_region, configuration_id)
227 |             adjusted_start = change_time + datetime.timedelta(minutes=-5000)
228 |             adjusted_end = change_time + datetime.timedelta(minutes=100)
229 | 
230 |         widget_images.extend(build_dashboard(
231 |             dashboard_metrics, annotation_time, adjusted_start, adjusted_end, region))
232 |         additional_metrics_with_timestamps_removed.extend(
233 |             get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, adjusted_end, region))
234 | 
235 |     return {
236 |         "contextual_links": contextual_links,
237 |         "log_information": log_information,
238 |         "log_events": log_events,
239 |         "resource_information": resource_information,
240 |         "resource_information_object": resource_information_object,
241 |         "notifications": notifications,
242 |         "widget_images": widget_images,
243 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
244 |         "trace_summary": trace_summary,
245 |         "trace": trace,
246 |         "tags": tags
247 |     }
248 | 


--------------------------------------------------------------------------------
/alarm_context_tool/sns_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | 
  4 | from functions import get_dashboard_button
  5 | from functions import get_information_panel
  6 | from functions import get_html_table
  7 | from functions_logs import get_last_10_events
  8 | from functions_logs import get_log_insights_link
  9 | from functions_xray import generate_trace_html
 10 | from functions_logs import check_log_group_exists
 11 | from functions_metrics import build_dashboard
 12 | from functions_metrics import get_metrics_from_dashboard_metrics 
 13 | from functions_xray import process_traces
 14 | 
 15 | from aws_lambda_powertools import Logger
 16 | from aws_lambda_powertools import Tracer
 17 | logger = Logger()
 18 | tracer = Tracer()
 19 | 
 20 | @tracer.capture_method
 21 | def process_sns_topic(dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
 22 |     """
 23 |     Processes the given SNS topic message and generates additional information, log information, summary and widget images.
 24 |     
 25 |     Args:
 26 |     - message: The SNS topic message to process.
 27 |     - region: The AWS region where the SNS topic resides.
 28 |     - account_id: The AWS account ID where the SNS topic resides.
 29 |     - namespace: The CloudWatch namespace to query for metrics.
 30 |     - change_time: The time of the change in ISO format with timezone information.
 31 |     - annotation_time: The time to use as the annotation in ISO format with timezone information.
 32 |     - start_time: The start time of the query, in ISO format with timezone information.
 33 |     - end_time: The end time of the query, in ISO format with timezone information.
 34 |     - start: The start time of the dashboard, in ISO format with timezone information.
 35 |     - end: The end time of the dashboard, in ISO format with timezone information.
 36 | 
 37 |     Returns:
 38 |     - A dictionary
 39 |     """
 40 | 
 41 |     for elements in dimensions:
 42 |         if elements['name'] == 'TopicName':
 43 |             id = elements['value']
 44 |             link = 'https://%s.console.aws.amazon.com/sns/v3/home?region=%s#/topic/arn:aws:sns:%s:%s:%s' % (region, region, region, account_id, str(id))   
 45 |             contextual_links = get_dashboard_button("%s details" % (str(id)), link) 
 46 |             link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#home:dashboards/SNS?~(alarmStateFilter~(~\'ALARM))' % (region, region)   
 47 |             contextual_links += get_dashboard_button("SNS in ALARM dashboard" , link)  
 48 |             
 49 |             # sns/us-east-2/012345678910/topic-name
 50 |             log_group_name = f"sns/{region}/{account_id}/{id}"
 51 |             if check_log_group_exists(log_group_name, region):
 52 |                 log_input = {"logGroupName": log_group_name}
 53 |                 log_information, log_events =  get_last_10_events(log_input, change_time, region) 
 54 |                 
 55 |                 # Log Insights Link
 56 |                 log_insights_query = """fields @timestamp, delivery.statusCode as code, status, delivery.attempts as attempts, notification.messageId as messageId,  @message
 57 |                     | sort @timestamp desc
 58 |                     | limit 100"""
 59 |                 log_insights_link = get_log_insights_link(log_input, log_insights_query, region, start_time, end_time)
 60 |                 contextual_links += get_dashboard_button("Log Insights" , log_insights_link) 
 61 |                 notifications = None
 62 |             else:
 63 |                 panel_title = "Your SNS topic is not writing logs to CloudWatch Logs"
 64 |                 panel_content = 'For additional information, configure SNS to log status to CloudWatch Logs. Follow the instructions <a href="https://docs.aws.amazon.com/sns/latest/dg/sms_stats_cloudwatch.html#sns-viewing-cloudwatch-logs" rel="noopener" target="_blank">here&nbsp;<span><img style="margin-bottom: -4px;" src="cid:imageId2"></span></a>'
 65 |                 log_information = None
 66 |                 log_events = None                
 67 |                 notifications = get_information_panel(panel_title, panel_content)                    
 68 |             
 69 |             dashboard_metrics = [    
 70 |                 {
 71 |                     "title": "Number Of Notifications Delivered: Sum",
 72 |                     "view": "timeSeries",
 73 |                     "stacked": False,
 74 |                     "stat": "Sum",
 75 |                     "period": 60,
 76 |                     "metrics": [
 77 |                         [ namespace, "NumberOfNotificationsDelivered", elements['name'], id]
 78 |                     ]
 79 |                 },
 80 |                 {
 81 |                     "title": "Number Of Notifications Failed: Sum",
 82 |                     "view": "timeSeries",
 83 |                     "stacked": False,
 84 |                     "stat": "Sum",
 85 |                     "period": 60,
 86 |                     "metrics": [
 87 |                         [ namespace, "NumberOfNotificationsFailed", elements['name'], id]
 88 |                     ]
 89 |                 }            
 90 |             ]
 91 |             
 92 |             widget_images = build_dashboard(dashboard_metrics, annotation_time, start, end, region)   
 93 |             additional_metrics_with_timestamps_removed = get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region)
 94 | 
 95 |             
 96 |             # Get topic attributes
 97 |             sns = boto3.client('sns', region_name=region)        
 98 |             topic_arn = "arn:aws:sns:%s:%s:%s" % (region, account_id, str(id))
 99 |             
100 |             try:
101 |                 response = sns.get_topic_attributes(TopicArn=topic_arn)   
102 |             except botocore.exceptions.ClientError as error:
103 |                 logger.exception("Error getting topic attributes")
104 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
105 |             except botocore.exceptions.ParamValidationError as error:
106 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))
107 |                           
108 |             resource_information = get_html_table("SNS Topic: " +id, response['Attributes'])  
109 |             resource_information_object = response['Attributes']
110 |             
111 |             # Get Trace information
112 |             filter_expression = f'!OK and service(id(name: "{id}", type: "AWS::SNS::Topic")) AND service(id(account.id: "{account_id}"))'
113 |             logger.info("X-Ray Filter Expression", filter_expression=filter_expression)
114 |             trace_summary, trace = process_traces(filter_expression, region, start_time, end_time)
115 |             
116 |         else:
117 |             contextual_links = None
118 |             log_information = None
119 |             log_events = None
120 |             resource_information = None
121 |             resource_information_object = None
122 |             widget_images = None
123 |             additional_metrics_with_timestamps_removed = None
124 |             trace_summary = None
125 |             trace = None
126 |             notifications = None
127 |     
128 |     return {
129 |         "contextual_links": contextual_links,
130 |         "log_information": log_information,
131 |         "log_events": log_events,
132 |         "resource_information": resource_information,
133 |         "resource_information_object": resource_information_object,
134 |         "notifications": notifications,
135 |         "widget_images": widget_images,
136 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
137 |         "trace_summary": trace_summary,
138 |         "trace": trace
139 |     }        


--------------------------------------------------------------------------------
/alarm_context_tool/ssm_run_command_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore 
  3 | 
  4 | import datetime
  5 | 
  6 | from functions import get_dashboard_button
  7 | from functions import get_html_table_with_fields
  8 | from functions_metrics import build_dashboard
  9 | from functions_metrics import get_metrics_from_dashboard_metrics
 10 | 
 11 | from aws_lambda_powertools import Logger
 12 | from aws_lambda_powertools import Tracer
 13 | logger = Logger()
 14 | tracer = Tracer()
 15 | 
 16 | @tracer.capture_method
 17 | def process_ssm_run_command(metric_name, dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
 18 |     
 19 |     if metric_name in ["CommandsDeliveryTimedOut", "CommandsFailed"]:
 20 |         link = 'https://%s.console.aws.amazon.com/systems-manager/run-command/complete-commands?region=%s' % (region, region)   
 21 |         contextual_links = get_dashboard_button("SSM Run Commmand", link) 
 22 |         link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#home:dashboards/SSM-RunCommand?~(alarmStateFilter~(~\'ALARM))' % (region, region)   
 23 |         contextual_links += get_dashboard_button("SSM Run Command in ALARM dashboard", link)
 24 | 
 25 |         dashboard_metrics = []
 26 |         for metric in ["CommandsDeliveryTimedOut", "CommandsFailed", "CommandsSucceeded"]:
 27 |             if metric not in metric_name:
 28 |                 dashboard_metrics.append(
 29 |                     {
 30 |                         "title": metric,
 31 |                         "view": "timeSeries",
 32 |                         "stacked": False,
 33 |                         "stat": "Sum",
 34 |                         "period": 60,
 35 |                         "metrics": [
 36 |                             ["AWS/SSM-RunCommand", metric]
 37 |                         ]
 38 |                     }
 39 |                 )
 40 |         widget_images = build_dashboard(dashboard_metrics, annotation_time, start, end, region) 
 41 |         additional_metrics_with_timestamps_removed = get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region) 
 42 |                 
 43 |         # SSM Client
 44 |         ssm_client = boto3.client('ssm', region_name=region)
 45 | 
 46 |         # Date formats required for filters
 47 |         change_time_str = change_time.astimezone(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
 48 |         start_time_str = start.astimezone(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')    
 49 | 
 50 |         # Failed Commands
 51 |         try:
 52 |             paginator = ssm_client.get_paginator('list_commands')
 53 |             commands_list = []
 54 |             for page in paginator.paginate(
 55 |                 Filters=[
 56 |                     {'key': 'Status', 'value': 'Failed'},
 57 |                     {'key': 'InvokedBefore', 'value': change_time_str},
 58 |                     {'key': 'InvokedAfter', 'value': start_time_str}
 59 |                 ]               
 60 |             ):
 61 |                 commands_list.extend(page['Commands'])
 62 |             response_failed = {'Commands': commands_list} 
 63 | 
 64 |         except botocore.exceptions.ClientError as error:
 65 |             logger.exception("Error getting failed SSM commands")
 66 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
 67 |         except botocore.exceptions.ParamValidationError as error:
 68 |             raise ValueError('The parameters you provided are incorrect: {}'.format(error))
 69 | 
 70 |         # Timed Out Commands
 71 |         try:
 72 |             paginator = ssm_client.get_paginator('list_commands')
 73 |             commands_list = []
 74 |             for page in paginator.paginate(
 75 |                 Filters=[
 76 |                     {'key': 'Status', 'value': 'TimedOut'},
 77 |                     {'key': 'InvokedBefore', 'value': change_time_str},
 78 |                     {'key': 'InvokedAfter', 'value': start_time_str}
 79 |                 ]               
 80 |             ):
 81 |                 commands_list.extend(page['Commands'])
 82 |             response_timed_out = {'Commands': commands_list}             
 83 |         except botocore.exceptions.ClientError as error:
 84 |             logger.exception("Error getting timed out SSM commands")
 85 |             raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
 86 |         except botocore.exceptions.ParamValidationError as error:
 87 |             raise ValueError('The parameters you provided are incorrect: {}'.format(error)) 
 88 | 
 89 |         # Add commands together  
 90 |         commands = response_failed['Commands'] + response_timed_out['Commands']
 91 | 
 92 |         items_list = []
 93 |         for command in commands:
 94 |             
 95 |             command_id = command.get('CommandId', '')
 96 |             command_link = 'https://%s.console.aws.amazon.com/systems-manager/run-command/%s?region=%s'  % (region, command_id, region)
 97 |             document_name = command.get('DocumentName', '')
 98 |             status = command.get('Status', '')
 99 |             requested_datetime = command.get('RequestedDateTime', '').strftime('%Y-%m-%d %H:%M:%S')
100 | 
101 |             items_list.append({'Command ID': {'value': command_id, 'link': command_link}, 'Document Name': document_name, 'Status': status, 'Requested Date Time': requested_datetime})
102 |         
103 |         fields = ['Command ID', 'Document Name', 'Status', 'Requested Date Time']
104 |         log_information = get_html_table_with_fields('SSM Failed or Timed Out Command Invocations', items_list, fields)
105 |         log_events = items_list
106 |     elif metric_name == "CommandsSucceeded":
107 |         # Add Code here if you have an alarm for a successful run
108 |         logger.info("There is no code to deal with an alarm associated with a successful SSM command.")
109 |     else:
110 |         contextual_links = None
111 |         log_information = None
112 |         log_events = None
113 |         resource_information = None
114 |         resource_information_object = None
115 |         widget_images = None
116 |         additional_metrics_with_timestamps_removed = None
117 |         trace_summary = None
118 |         trace = None
119 |         notifications = None    
120 |     return {
121 |         "contextual_links": contextual_links,
122 |         "log_information": log_information,
123 |         "log_events": log_events,
124 |         "resource_information": None,
125 |         "resource_information_object": None,
126 |         "notifications": None,
127 |         "widget_images": widget_images,
128 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
129 |         "trace_summary": None,
130 |         "trace": None
131 |     } 


--------------------------------------------------------------------------------
/alarm_context_tool/synthetics_handler.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import botocore
  3 | 
  4 | from datetime import timedelta
  5 | 
  6 | from functions import get_dashboard_button
  7 | from functions import get_html_table
  8 | from functions_logs import get_last_10_events
  9 | from functions_logs import get_log_insights_link
 10 | from functions_metrics import build_dashboard
 11 | from functions_metrics import get_metrics_from_dashboard_metrics 
 12 | from functions_xray import process_traces
 13 | 
 14 | from aws_lambda_powertools import Logger
 15 | from aws_lambda_powertools import Tracer
 16 | logger = Logger()
 17 | tracer = Tracer()
 18 | 
 19 | @tracer.capture_method
 20 | def process_synthetics(dimensions, region, account_id, namespace, change_time, annotation_time, start_time, end_time, start, end):
 21 |     for elements in dimensions:
 22 |         if elements['name'] == 'CanaryName':
 23 |             id = elements['value']
 24 |             link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#synthetics:canary/detail/%s' % (region, region, str(id))   
 25 |             contextual_links = get_dashboard_button("%s details" % (str(id)), link) 
 26 |             link = 'https://%s.console.aws.amazon.com/cloudwatch/home?region=%s#home:dashboards/CloudWatchSynthetics?~(alarmStateFilter~(~\'ALARM))' % (region, region)   
 27 |             contextual_links += get_dashboard_button("Canaries in ALARM dashboard", link)
 28 | 
 29 |             dashboard_metrics = [
 30 |                 {
 31 |                     "title": "Duration",
 32 |                     "view": "singleValue",
 33 |                     "stacked": False,
 34 |                     "stat": "Average",
 35 |                     "period": 60,
 36 |                     "metrics": [
 37 |                         [namespace, "Duration", 'CanaryName', id]
 38 |                     ]
 39 |                 },
 40 |                 {
 41 |                     "title": "Failed",
 42 |                     "view": "singleValue",
 43 |                     "stacked": False,
 44 |                     "stat": "Sum",
 45 |                     "period": 60,
 46 |                     "metrics": [
 47 |                         [namespace, "Failed", 'CanaryName', id]
 48 |                     ]
 49 |                 },
 50 |                 {
 51 |                     "title": "4xx",
 52 |                     "view": "singleValue",
 53 |                     "stacked": False,
 54 |                     "stat": "Sum",
 55 |                     "period": 60,
 56 |                     "metrics": [
 57 |                         [namespace, "4xx", 'CanaryName', id]
 58 |                     ]
 59 |                 },
 60 |                 {
 61 |                     "title": "5xx",
 62 |                     "view": "singleValue",
 63 |                     "stacked": False,
 64 |                     "stat": "Sum",
 65 |                     "period": 60,
 66 |                     "metrics": [
 67 |                         [namespace, "5xx", 'CanaryName', id]
 68 |                     ]
 69 |                 },
 70 |                 {
 71 |                     "title": "Duration",
 72 |                     "view": "timeSeries",
 73 |                     "stacked": False,
 74 |                     "stat": "Average",
 75 |                     "period": 60,
 76 |                     "metrics": [
 77 |                         [namespace, "Duration", 'CanaryName', id]
 78 |                     ]
 79 |                 },
 80 |                 {
 81 |                     "title": "SuccessPercent",
 82 |                     "view": "timeSeries",
 83 |                     "stacked": False,
 84 |                     "stat": "Average",
 85 |                     "period": 60,
 86 |                     "metrics": [
 87 |                         [namespace, "SuccessPercent", 'CanaryName', id]
 88 |                     ]
 89 |                 },
 90 |                 {
 91 |                     "title": "2xx",
 92 |                     "view": "timeSeries",
 93 |                     "stacked": False,
 94 |                     "stat": "Sum",
 95 |                     "period": 60,
 96 |                     "metrics": [
 97 |                         [namespace, "2xx", 'CanaryName', id]
 98 |                     ]
 99 |                 },
100 |                 {
101 |                     "title": "4xx",
102 |                     "view": "timeSeries",
103 |                     "stacked": False,
104 |                     "stat": "Sum",
105 |                     "period": 60,
106 |                     "metrics": [
107 |                         [namespace, "4xx", 'CanaryName', id]
108 |                     ]
109 |                 },
110 |                 {
111 |                     "title": "5xx",
112 |                     "view": "timeSeries",
113 |                     "stacked": False,
114 |                     "stat": "Sum",
115 |                     "period": 60,
116 |                     "metrics": [
117 |                         [namespace, "5xx", 'CanaryName', id]
118 |                     ]
119 |                 },
120 |                 {
121 |                     "title": "Failed",
122 |                     "view": "timeSeries",
123 |                     "stacked": False,
124 |                     "stat": "Sum",
125 |                     "period": 60,
126 |                     "metrics": [
127 |                         [namespace, "Failed", 'CanaryName', id]
128 |                     ]
129 |                 }
130 |             ]
131 |             widget_images = build_dashboard(dashboard_metrics, annotation_time, start, end, region) 
132 |             additional_metrics_with_timestamps_removed = get_metrics_from_dashboard_metrics(dashboard_metrics, change_time, end, region)
133 |             
134 |             synthetics = boto3.client('synthetics', region_name=region) 
135 |             
136 |             # Describe Canaries
137 |             try:
138 |                 response = synthetics.describe_canaries(Names=[id])
139 |             except botocore.exceptions.ClientError as error:
140 |                 logger.exception("Error describing canaries")
141 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
142 |             except botocore.exceptions.ParamValidationError as error:
143 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))
144 |             
145 |             resource_information = get_html_table("Canary: " +id, response['Canaries'][0])  
146 |             
147 |             # Get information from EngineARN
148 |             logger.info("Canary Lambda Function ARN", engine_arn=response['Canaries'][0]['EngineArn'])
149 |             engine_arn = response['Canaries'][0]['EngineArn'].split(':')
150 |             log_group_name = '/aws/lambda/' +engine_arn[6]
151 |             
152 |             log_input = {"logGroupName": log_group_name}
153 |             log_information, log_events =  get_last_10_events(log_input, change_time, region) 
154 |             
155 |             # Log Insights Link
156 |             log_insights_query = """fields @timestamp, @message
157 |                 | sort @timestamp desc
158 |                 | limit 200"""
159 |             log_insights_link = get_log_insights_link(log_input, log_insights_query, region, start_time, end_time)
160 |             contextual_links += get_dashboard_button("Log Insights" , log_insights_link)                 
161 | 
162 |                       
163 |             # Describe last run
164 |             try:
165 |                 response = synthetics.get_canary_runs(
166 |                     Name=id,
167 |                     MaxResults=10
168 |                 ) 
169 |             except botocore.exceptions.ClientError as error:
170 |                 logger.exception("Error getting canary runs")
171 |                 raise RuntimeError(f"Unable to fullfil request error encountered as : {error}") from error  
172 |             except botocore.exceptions.ParamValidationError as error:
173 |                 raise ValueError('The parameters you provided are incorrect: {}'.format(error))            
174 |    
175 |             
176 |             # Initialize a variable to store the selected run
177 |             selected_run = None
178 |             
179 |             # Loop through the list of CanaryRuns
180 |             for run in response.get("CanaryRuns", []):
181 |                 run_status = run.get("Status", {}).get("State")
182 |             
183 |                 # Check if the run status is "FAILED"
184 |                 if run_status == "FAILED":
185 |                     selected_run = run
186 |                     break  # Stop at the first FAILED run
187 |             
188 |             # If no failed run was found, select the first run
189 |             if selected_run is None and response.get("CanaryRuns"):
190 |                 selected_run = response["CanaryRuns"][0]
191 |             
192 |             # Log the selected Canary Run (you can access its details using selected_run)
193 |             logger.info("Last 10 Canary Runs", extra=response)
194 |             logger.info("Selected Canary Run", selected_run=selected_run)
195 | 
196 |             resource_information += get_html_table("Last Canary Run for " +id, selected_run)
197 |             resource_information_object = selected_run
198 |             
199 |             
200 |             # Get Trace information
201 | 
202 |             # Define the original time range for the query based on the canary run data
203 |             original_start_time = selected_run['Timeline']['Started']
204 |             original_end_time = selected_run['Timeline']['Completed']
205 |             
206 |             # Adjust the start and end times to include a 5-minute buffer
207 |             trace_start_time = original_start_time - timedelta(minutes=5)
208 |             trace_end_time = original_end_time + timedelta(minutes=5)
209 | 
210 |             # Format datetime objects into strings for logging or API calls
211 |             trace_start_time_str = trace_start_time.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'  # Example formatting
212 |             trace_end_time_str = trace_end_time.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'  # Example formatting
213 | 
214 |             # Canary run ID
215 |             canary_run_id = selected_run['Id']
216 |             
217 |             # Define the X-ray filter expression using the canary run ID
218 |             filter_expression = f'annotation.aws:canary_run_id = "{canary_run_id}" and responsetime > 0'
219 |             logger.info("X-Ray Filter Expression", filter_expression=filter_expression)            
220 |             trace_summary, trace = process_traces(filter_expression, region, trace_start_time_str, trace_end_time_str)
221 | 
222 |         else:
223 |             contextual_links = None
224 |             log_information = None
225 |             log_events = None
226 |             resource_information = None
227 |             resource_information_object = None
228 |             widget_images = None
229 |             additional_metrics_with_timestamps_removed = None
230 |             trace_summary = None
231 |             trace = None
232 |             notifications = None
233 |     
234 |     return {
235 |         "contextual_links": contextual_links,
236 |         "log_information": log_information,
237 |         "log_events": log_events,
238 |         "resource_information": resource_information,
239 |         "resource_information_object": resource_information_object,
240 |         "notifications": None,
241 |         "widget_images": widget_images,
242 |         "additional_metrics_with_timestamps_removed": additional_metrics_with_timestamps_removed,
243 |         "trace_summary": trace_summary,
244 |         "trace": trace
245 |     }


--------------------------------------------------------------------------------
/dependencies_layer/requirements.txt:
--------------------------------------------------------------------------------
1 | markdown
2 | boto3
3 | pandas
4 | dnspython
5 | PyYAML
6 | cfn_flip


--------------------------------------------------------------------------------
/samconfig.toml:
--------------------------------------------------------------------------------
 1 | version = 0.1
 2 | [default.deploy.parameters]
 3 | stack_name = "alarm-context-tool"
 4 | resolve_s3 = true
 5 | s3_prefix = "alarm-context-tool"
 6 | region = "us-east-2"
 7 | confirm_changeset = true
 8 | capabilities = "CAPABILITY_IAM"
 9 | image_repositories = []
10 | [default.remote_invoke.parameters]
11 | stack_name = "alarm-context-tool"


--------------------------------------------------------------------------------
/template.yaml:
--------------------------------------------------------------------------------
  1 | AWSTemplateFormatVersion: "2010-09-09"
  2 | Transform: AWS::Serverless-2016-10-31
  3 | Description: A Lambda function to add context to CloudWatch Alarms
  4 | Resources:
  5 | 
  6 |   AlarmContextToolSNSTopic:
  7 |     Type: "AWS::SNS::Topic"
  8 |     Properties:
  9 |       DisplayName: "Alarm Context DLQ"
 10 |       TopicName: "AlarmContextToolDLQ"
 11 |       KmsMasterKeyId: alias/aws/sns
 12 | 
 13 |   DependenciesLayer:
 14 |     Type: AWS::Serverless::LayerVersion
 15 |     Properties:
 16 |       ContentUri: dependencies_layer/
 17 |       CompatibleRuntimes:
 18 |         - python3.12
 19 |     Metadata:
 20 |       BuildMethod: python3.12
 21 |       BuildArchitecture: x86_64
 22 | 
 23 |   AlarmContextFunction:
 24 |     # checkov:skip=CKV_AWS_117:The Lambda function needs to access resources over the Internet
 25 |     # checkov:skip=CKV_AWS_173:Environment variables do not contain sensitive data
 26 |     Metadata:
 27 |       cfn_nag:
 28 |         rules_to_suppress:
 29 |           - id: W11
 30 |             reason: "Wildcards are required for permissions to apply read permissions to any resource"
 31 |           - id: W89
 32 |             reason: "The Lambda function needs to access resources over the Internet"     
 33 |     Type: AWS::Serverless::Function
 34 |     Properties:
 35 |       CodeUri: alarm_context_tool
 36 |       DeadLetterQueue:
 37 |         Type: SNS
 38 |         TargetArn: !GetAtt AlarmContextToolSNSTopic.TopicArn
 39 |       Description: >-
 40 |         A Lambda function to add context to CloudWatch Alarms
 41 |       MemorySize: 1024
 42 |       Timeout: 900
 43 |       Handler: lambda_function.alarm_handler
 44 |       Runtime: python3.12
 45 |       Architectures:
 46 |         - x86_64
 47 |       EphemeralStorage:
 48 |         Size: 512
 49 |       Environment:
 50 |         Variables:
 51 |           AWS_LAMBDA_LOG_LEVEL: INFO
 52 |           ANTHROPIC_VERSION: bedrock-2023-05-31
 53 |           BEDROCK_MODEL_ID: anthropic.claude-3-sonnet-20240229-v1:0
 54 |           BEDROCK_REGION: us-east-1
 55 |           BEDROCK_MAX_TOKENS: 4000
 56 |           METRIC_ROUNDING_PRECISION_FOR_BEDROCK: 3
 57 |           POWERTOOLS_LOG_LEVEL: INFO
 58 |           POWERTOOLS_LOGGER_LOG_EVENT: "True"
 59 |           POWERTOOLS_SERVICE_NAME: Alarm
 60 |           POWERTOOLS_TRACER_CAPTURE_RESPONSE: "False"
 61 |           RECIPIENT: alias@domain.com
 62 |           SENDER: name <alias@domain.com>
 63 |           USE_BEDROCK: "True"    
 64 |       EventInvokeConfig:
 65 |         MaximumEventAgeInSeconds: 21600
 66 |         MaximumRetryAttempts: 2
 67 |       Layers:
 68 |         - !Sub arn:aws:lambda:${AWS::Region}:580247275435:layer:LambdaInsightsExtension:49
 69 |         - !Sub arn:aws:lambda:${AWS::Region}:017000801446:layer:AWSLambdaPowertoolsPythonV2:71
 70 |         - !Ref DependenciesLayer
 71 |       PackageType: Zip
 72 |       Policies:
 73 |         - Statement:
 74 |             - Effect: Allow
 75 |               Action:
 76 |                 - ec2:DescribeInstances
 77 |                 - ec2:DescribeImages
 78 |               Resource: "*"
 79 |         - Statement:
 80 |             - Effect: Allow
 81 |               Action:        
 82 |                 - sns:GetTopicAttributes
 83 |               Resource: "*"
 84 |         - Statement:
 85 |             - Effect: Allow
 86 |               Action:              
 87 |                 - synthetics:DescribeCanariesLastRun
 88 |                 - synthetics:DescribeCanaries
 89 |                 - synthetics:GetCanaryRuns
 90 |               Resource: "*" 
 91 |         - Statement:
 92 |             - Effect: Allow
 93 |               Action:
 94 |                 - elasticloadbalancing:DescribeLoadBalancers
 95 |                 - elasticloadbalancing:DescribeTags
 96 |                 - elasticloadbalancing:DescribeTargetGroups 
 97 |               Resource: "*"
 98 |         - Statement:
 99 |             - Effect: Allow
100 |               Action:               
101 |                 - logs:DescribeLogGroups
102 |                 - logs:DescribeLogStreams
103 |                 - logs:GetLogEvents
104 |                 - logs:FilterLogEvents                
105 |                 - logs:FilterLogEvents
106 |               Resource: "*"
107 |         - Statement:
108 |             - Effect: Allow
109 |               Action:
110 |                 - lambda:GetFunction
111 |               Resource: "*"
112 |         - Statement:
113 |             - Effect: Allow
114 |               Action:
115 |                 - ses:SendRawEmail
116 |               Resource: "*"
117 |         - Statement:
118 |             - Effect: Allow
119 |               Action:
120 |                 - ecs:DescribeClusters
121 |                 - ecs:DescribeTaskDefinition
122 |                 - ecs:DescribeServices
123 |               Resource: "*"
124 |         - Statement:
125 |             - Effect: Allow
126 |               Action:
127 |                 - ssm:DescribeInstanceInformation
128 |               Resource: "*"
129 |         - Statement:
130 |             - Effect: Allow
131 |               Action:
132 |                 - dynamodb:DescribeTable
133 |                 - dynamodb:ListTagsOfResource
134 |               Resource: "*"
135 |         - Statement:
136 |             - Effect: Allow
137 |               Action:
138 |                 - autoscaling:DescribeAutoScalingGroups
139 |               Resource: "*"
140 |         - Statement:
141 |             - Effect: Allow
142 |               Action:
143 |                 - cloudwatch:GetMetricWidgetImage
144 |                 - cloudwatch:GetMetricData
145 |                 - cloudwatch:DescribeAlarmHistory
146 |               Resource: "*"
147 |         - Statement:
148 |             - Effect: Allow
149 |               Action:
150 |                 - xray:GetTraceSummaries
151 |                 - xray:BatchGetTraces
152 |               Resource: "*"
153 |         - Statement:
154 |             - Effect: Allow
155 |               Action:
156 |                 - ssm:ListCommands
157 |               Resource: "*" 
158 |         - Statement:
159 |             - Effect: Allow
160 |               Action:    
161 |                 - health:DescribeEvents
162 |                 - health:DescribeEventDetails
163 |               Resource: "*" 
164 |         - Statement:
165 |             - Effect: Allow
166 |               Action:   
167 |                 - cloudformation:GetTemplate
168 |               Resource: "*"  
169 |         - Statement:
170 |             - Effect: Allow
171 |               Action:  
172 |                 - rds:DescribeDBClusters
173 |                 - rds:DescribeDBInstances
174 |               Resource: "*"
175 |         - Statement:
176 |             - Effect: Allow
177 |               Action:
178 |                 - pi:ListAvailableResourceMetrics
179 |                 - pi:GetResourceMetrics
180 |               Resource: "*"
181 |         - Statement:
182 |             - Effect: Allow
183 |               Action:
184 |                 - eks:DescribeCluster
185 |               Resource: "*"
186 |         - Statement:
187 |             - Effect: Allow
188 |               Action:
189 |                 - bedrock:InvokeModel
190 |               Resource: arn:*:bedrock:*::foundation-model/*
191 |         - Statement:
192 |             - Effect: Allow
193 |               Action: 
194 |                 - apigateway:GET
195 |               Resource:
196 |                 - arn:aws:apigateway:*::/apis/*/stages
197 |                 - arn:aws:apigateway:*::/apis/*/stages/*                
198 |                 - arn:aws:apigateway:*::/restapis
199 |                 - arn:aws:apigateway:*::/restapis/*   
200 |         - Statement:
201 |             - Effect: Allow
202 |               Action: 
203 |                 - sns:Publish
204 |               Resource: !GetAtt AlarmContextToolSNSTopic.TopicArn
205 |       ReservedConcurrentExecutions: 10        
206 |       Tracing: Active
207 |       LoggingConfig:
208 |         LogFormat: JSON 
209 | 


--------------------------------------------------------------------------------