├── src
    ├── layer
    │   ├── __init__.py
    │   ├── requirements.txt
    │   ├── README.md
    │   └── common.py
    ├── cleanup_resources_function
    │   ├── __init__.py
    │   ├── requirements.txt
    │   ├── README.md
    │   └── cleanup_resources.py
    ├── dashboard_mgmt_function
    │   ├── __init__.py
    │   ├── requirements.txt
    │   ├── dashboard-template.mustache
    │   ├── README.md
    │   └── dashboard_mgmt.py
    ├── personalize_monitor_function
    │   ├── __init__.py
    │   ├── requirements.txt
    │   ├── README.md
    │   └── personalize_monitor.py
    ├── personalize_delete_campaign_function
    │   ├── __init__.py
    │   ├── requirements.txt
    │   ├── personalize_delete_campaign.py
    │   └── README.md
    └── personalize_update_campaign_tps_function
    │   ├── __init__.py
    │   ├── requirements.txt
    │   ├── personalize_update_campaign_tps.py
    │   └── README.md
├── .gitignore
├── images
    ├── personalize-monitor-architecture.png
    ├── personalize-monitor-cloudwatch-alarms.png
    ├── personalize-monitor-cloudwatch-dashboard.png
    └── personalize-monitor-cloudwatch-metrics.png
├── .github
    └── PULL_REQUEST_TEMPLATE.md
├── CODE_OF_CONDUCT.md
├── samconfig.toml
├── sar-publish.sh
├── LICENSE
├── CONTRIBUTING.md
├── README-SAR.md
├── template.yaml
└── README.md


/src/layer/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/cleanup_resources_function/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/dashboard_mgmt_function/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/personalize_monitor_function/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/personalize_delete_campaign_function/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/personalize_update_campaign_tps_function/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .DS_Store
3 | .vscode
4 | .aws-sam
5 | env


--------------------------------------------------------------------------------
/src/cleanup_resources_function/requirements.txt:
--------------------------------------------------------------------------------
1 | # Dependencies bundled in layer


--------------------------------------------------------------------------------
/src/personalize_monitor_function/requirements.txt:
--------------------------------------------------------------------------------
1 | # Dependencies bundled in layer


--------------------------------------------------------------------------------
/src/personalize_delete_campaign_function/requirements.txt:
--------------------------------------------------------------------------------
1 | # Dependencies bundled in layer


--------------------------------------------------------------------------------
/src/personalize_update_campaign_tps_function/requirements.txt:
--------------------------------------------------------------------------------
1 | # Dependencies bundled in layer


--------------------------------------------------------------------------------
/src/dashboard_mgmt_function/requirements.txt:
--------------------------------------------------------------------------------
1 | # Other dependencies bundled in layer
2 | chevron==0.13.1


--------------------------------------------------------------------------------
/src/layer/requirements.txt:
--------------------------------------------------------------------------------
1 | aws-lambda-powertools==1.6.1
2 | crhelper==2.0.6
3 | expiring-dict==1.1.0


--------------------------------------------------------------------------------
/images/personalize-monitor-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-architecture.png


--------------------------------------------------------------------------------
/images/personalize-monitor-cloudwatch-alarms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-alarms.png


--------------------------------------------------------------------------------
/images/personalize-monitor-cloudwatch-dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-dashboard.png


--------------------------------------------------------------------------------
/images/personalize-monitor-cloudwatch-metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-metrics.png


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 | 
3 | *Description of changes:*
4 | 
5 | 
6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
7 | 


--------------------------------------------------------------------------------
/src/cleanup_resources_function/README.md:
--------------------------------------------------------------------------------
1 | # Amazon Personalize Monitor - Cleanup Function
2 | 
3 | This Lambda function is called as a CloudFormation custom resource when the application is deleted/uninstalled so that resources created dynamically by the application, such as CloudWatch alarms, are also deleted.


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/samconfig.toml:
--------------------------------------------------------------------------------
1 | version = 0.1
2 | [default]
3 | [default.deploy]
4 | [default.deploy.parameters]
5 | stack_name = "personalize-monitor"
6 | s3_prefix = "personalize-monitor"
7 | parameter_overrides = "CampaignARNs=\"all\" AutoCreateCampaignUtilizationAlarms=\"Yes\" CampaignThresholdAlarmLowerBound=\"100\" AutoCreateIdleCampaignAlarms=\"Yes\""
8 | capabilities = "CAPABILITY_IAM"


--------------------------------------------------------------------------------
/src/layer/README.md:
--------------------------------------------------------------------------------
1 | # Amazon Personalize Monitor - Common Lambda Layer
2 | 
3 | This [Lambda Layer](https://docs.aws.amazon.com/lambda/latest/dg/configuration-layers.html) includes dependencies shared across all/most functions in this application. In addition, the [common.py](./common.py) file includes utility functions that are also shared across the Lambda functions in this application.


--------------------------------------------------------------------------------
/sar-publish.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Utility script to deploy application to the Serverless Application Repository.
 4 | 
 5 | set -e
 6 | 
 7 | # Bucket must have policy to allow SAR access.
 8 | # See https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-template-publishing-applications.html
 9 | BUCKET=$1
10 | REGION=$2
11 | 
12 | if [ "$BUCKET" == "" ] || [ "$REGION" == "" ]; then
13 |     echo "Usage: $0 BUCKET REGION"
14 |     echo "  where BUCKET is the S3 bucket to deploy packaged resources for SAR and REGION is the AWS region where to publish the application"
15 |     exit 1
16 | fi
17 | 
18 | echo "Building application"
19 | sam build --use-container --cached
20 | 
21 | cd .aws-sam/build
22 | echo "Packaging application"
23 | sam package --template-file template.yaml --output-template-file packaged.yaml --s3-bucket $BUCKET
24 | echo "Publishing application to the SAR"
25 | sam publish --template packaged.yaml --region $REGION
26 | cd -


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 | 
16 | 


--------------------------------------------------------------------------------
/src/cleanup_resources_function/cleanup_resources.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: MIT-0
 3 | 
 4 | """Cleans up resources created by this application outside of CloudFormation
 5 | 
 6 | This function is called as a CloudFormation custom resource.
 7 | """
 8 | 
 9 | from crhelper import CfnResource
10 | from aws_lambda_powertools import Logger
11 | 
12 | from common import (
13 |     PROJECT_NAME,
14 |     ALARM_NAME_PREFIX,
15 |     extract_region,
16 |     get_client,
17 |     determine_campaign_arns
18 | )
19 | 
20 | logger = Logger()
21 | helper = CfnResource()
22 | 
@helper.delete
def delete_resource(event, _):
    """Delete the CloudWatch alarms that this application created.

    Registered on ``helper`` as the delete handler. The campaign ARNs are
    resolved from the event's ``ResourceProperties`` and reduced to the set of
    regions they live in. In each region, metric alarms whose name starts with
    ``ALARM_NAME_PREFIX`` and that carry a ``CreatedBy`` tag equal to
    ``PROJECT_NAME`` are deleted.

    Args:
        event: The custom resource event; only ``ResourceProperties`` is read.
        _: Unused second handler argument.
    """
    # CloudWatch DeleteAlarms accepts at most 100 alarm names per call.
    max_alarms_per_call = 100

    campaign_arns = determine_campaign_arns(event.get('ResourceProperties'))

    logger.debug('Campaigns to check for resources to delete: %s', campaign_arns)

    regions = {extract_region(campaign_arn) for campaign_arn in campaign_arns}

    logger.debug('Regions to check for resources to delete: %s', regions)

    alarms_deleted = 0

    for region in regions:
        cw = get_client(service_name='cloudwatch', region_name=region)

        alarm_names_to_delete = set()

        alarms_paginator = cw.get_paginator('describe_alarms')
        for alarms_page in alarms_paginator.paginate(AlarmNamePrefix=ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']):
            for alarm in alarms_page['MetricAlarms']:
                tags_response = cw.list_tags_for_resource(ResourceARN=alarm['AlarmArn'])

                # Only touch alarms tagged as created by this application.
                created_by_project = any(
                    tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME
                    for tag in tags_response['Tags']
                )
                if created_by_project:
                    alarm_names_to_delete.add(alarm['AlarmName'])

        if alarm_names_to_delete:
            logger.info('Deleting CloudWatch alarms in %s for campaigns %s: %s', region, campaign_arns, alarm_names_to_delete)

            # Delete in batches so a region with many alarms does not exceed the per-call limit.
            alarm_names = list(alarm_names_to_delete)
            for start in range(0, len(alarm_names), max_alarms_per_call):
                cw.delete_alarms(AlarmNames=alarm_names[start:start + max_alarms_per_call])

            alarms_deleted += len(alarm_names)

    logger.info('Deleted %d alarms', alarms_deleted)
60 | 
@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context):
    """Lambda entry point for the cleanup custom resource.

    Passes the event and context straight to the crhelper ``helper`` object,
    on which ``delete_resource`` is registered as the delete handler.
    """
    helper(event, context)


--------------------------------------------------------------------------------
/src/personalize_update_campaign_tps_function/personalize_update_campaign_tps.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: MIT-0
 3 | 
 4 | """
 5 | Utility Lambda function that can be used to update a Personalize campaign's minProvisionedTPS value
 6 | based on triggers such as CloudWatch event rules (i.e. cron) or application events. 
 7 | """
 8 | 
 9 | import json
10 | import boto3
11 | import os
12 | import json
13 | import logging
14 | 
15 | from aws_lambda_powertools import Logger
16 | 
17 | from common import (
18 |     extract_region,
19 |     get_client,
20 |     put_event
21 | )
22 | 
23 | logger = Logger()
24 |     
@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context):
    """Update the minProvisionedTPS value for an existing Personalize campaign.

    The parameters are read from ``event['detail']`` when the event has a
    non-empty ``detail`` (EventBridge style), otherwise from the top level of
    the event (direct invocation).

    Args:
        event: Must provide ``CampaignARN`` and ``MinProvisionedTPS``;
            ``Reason`` is optional.
        context: Lambda context (not used directly by this function).

    Returns:
        A message stating that the update was initiated.

    Raises:
        KeyError: If ``CampaignARN`` or ``MinProvisionedTPS`` is missing.
        ValueError: If ``MinProvisionedTPS`` is less than 1.
        Exception: If no region can be extracted from the campaign ARN.
    """
    params = event.get('detail') or event

    campaign_arn = params['CampaignARN']
    min_tps = params['MinProvisionedTPS']
    # Reason is optional in both event shapes; a default is filled in below.
    reason = params.get('Reason')

    if min_tps < 1:
        raise ValueError('"MinProvisionedTPS" must be >= 1')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name='personalize', region_name=region)

    response = personalize.update_campaign(campaignArn=campaign_arn, minProvisionedTPS=min_tps)

    # Skip serialising the response unless debug logging is actually enabled.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent=2, default=str))

    if not reason:
        reason = f'Amazon Personalize campaign {campaign_arn} minProvisionedTPS update to {min_tps} initiated (reason unspecified)'

    # Announce the change so downstream rules can react to it.
    put_event(
        detail_type='PersonalizeCampaignMinProvisionedTPSUpdated',
        detail=json.dumps({
            'CampaignARN': campaign_arn,
            'NewMinProvisionedTPS': min_tps,
            'Reason': reason
        }),
        resources=[campaign_arn]
    )

    logger.info({
        'campaignArn': campaign_arn,
        'minProvisionedTPS': min_tps
    })

    return f'Successfully initiated update of minProvisionedTPS to {min_tps} for campaign {campaign_arn}'


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 


--------------------------------------------------------------------------------
/src/personalize_delete_campaign_function/personalize_delete_campaign.py:
--------------------------------------------------------------------------------
 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | # SPDX-License-Identifier: MIT-0
 3 | 
 4 | """
 5 | Lambda function that is used to delete a Personalize campaign based on prolonged idle time 
 6 | and according to configuration to automatically delete campaigns under these conditions.
 7 | """
 8 | 
 9 | import json
10 | import boto3
11 | import os
12 | import json
13 | import logging
14 | 
15 | from aws_lambda_powertools import Logger
16 | 
17 | from common import (
18 |     extract_region,
19 |     get_client,
20 |     put_event
21 | )
22 | 
23 | logger = Logger()
24 | 
def delete_alarms_for_campaign(campaign_arn):
    """Delete the CloudWatch alarms this application created for a campaign.

    Looks, in the campaign's region, at metric alarms whose name starts with
    ``ALARM_NAME_PREFIX``. An alarm is deleted when it has a ``CampaignArn``
    dimension equal to ``campaign_arn`` and a ``CreatedBy`` tag equal to
    ``PROJECT_NAME``.

    Args:
        campaign_arn: ARN of the campaign whose alarms should be removed.
    """
    # This file's module-level import from common only brings in
    # extract_region, get_client and put_event, so fetch the constants here.
    from common import ALARM_NAME_PREFIX, PROJECT_NAME

    # CloudWatch DeleteAlarms accepts at most 100 alarm names per call.
    max_alarms_per_call = 100

    cw = get_client(service_name='cloudwatch', region_name=extract_region(campaign_arn))

    alarm_names_to_delete = set()

    alarms_paginator = cw.get_paginator('describe_alarms')
    for alarms_page in alarms_paginator.paginate(AlarmNamePrefix=ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']):
        for alarm in alarms_page['MetricAlarms']:
            # Tolerate alarms that report no dimensions instead of raising KeyError.
            monitors_campaign = any(
                dim['Name'] == 'CampaignArn' and dim['Value'] == campaign_arn
                for dim in alarm.get('Dimensions', [])
            )
            if not monitors_campaign:
                continue

            # Tags are only looked up for alarms on this campaign.
            tags_response = cw.list_tags_for_resource(ResourceARN=alarm['AlarmArn'])
            created_by_project = any(
                tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME
                for tag in tags_response['Tags']
            )
            if created_by_project:
                alarm_names_to_delete.add(alarm['AlarmName'])

    if not alarm_names_to_delete:
        logger.info('No CloudWatch alarms to delete for campaign %s', campaign_arn)
        return

    logger.info('Deleting CloudWatch alarms for campaign %s: %s', campaign_arn, alarm_names_to_delete)

    # Delete in batches so the per-call limit is never exceeded.
    alarm_names = list(alarm_names_to_delete)
    for start in range(0, len(alarm_names), max_alarms_per_call):
        cw.delete_alarms(AlarmNames=alarm_names[start:start + max_alarms_per_call])
49 | 
@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context):
    """Initiate the delete of a Personalize campaign.

    The parameters are read from ``event['detail']`` when the event has a
    non-empty ``detail`` (EventBridge style), otherwise from the top level of
    the event (direct invocation). After the delete is requested, a
    ``PersonalizeCampaignDeleted`` event and a
    ``BuildPersonalizeMonitorDashboard`` event are published, and the
    campaign's alarms are removed via ``delete_alarms_for_campaign``.

    Args:
        event: Must provide ``CampaignARN``; ``Reason`` is optional.
        context: Lambda context (not used directly by this function).

    Returns:
        A message stating that the delete was initiated.

    Raises:
        KeyError: If ``CampaignARN`` is missing.
        Exception: If no region can be extracted from the campaign ARN.
    """
    params = event.get('detail') or event

    campaign_arn = params['CampaignARN']
    # Reason is optional in both event shapes; a default is filled in below.
    reason = params.get('Reason')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name='personalize', region_name=region)

    response = personalize.delete_campaign(campaignArn=campaign_arn)

    # Skip serialising the response unless debug logging is actually enabled.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent=2, default=str))

    if not reason:
        reason = f'Amazon Personalize campaign {campaign_arn} deletion initiated (reason unspecified)'

    # Both events carry the same payload; only the detail type differs.
    event_detail = json.dumps({
        'CampaignARN': campaign_arn,
        'Reason': reason
    })

    put_event(
        detail_type='PersonalizeCampaignDeleted',
        detail=event_detail,
        resources=[campaign_arn]
    )

    put_event(
        detail_type='BuildPersonalizeMonitorDashboard',
        detail=event_detail,
        resources=[campaign_arn]
    )

    logger.info({
        'campaignArn': campaign_arn
    })

    delete_alarms_for_campaign(campaign_arn)

    return f'Successfully initiated delete of campaign {campaign_arn}'


--------------------------------------------------------------------------------
/src/dashboard_mgmt_function/dashboard-template.mustache:
--------------------------------------------------------------------------------
  1 | {
  2 | 	"widgets": [{
  3 | 		"type": "metric",
  4 | 		"width": 4,
  5 | 		"height": 4,
  6 | 		"properties": {
  7 | 			"metrics": [
  8 | 				["{{namespace}}", "monitoredCampaignCount"]
  9 | 			],
 10 | 			"view": "singleValue",
 11 | 			"region": "{{current_region}}",
 12 | 			"title": "Campaigns Monitored",
 13 | 			"stat": "Average",
 14 | 			"period": 300
 15 | 		}
 16 | 	},
 17 | 	{
 18 | 		"type": "text",
 19 | 		"width": 20,
 20 | 		"height": 4,
 21 | 		"properties": {
 22 | 			"markdown": "\n## Amazon Personalize Monitor Dashboard\n*This dashboard and its widgets were created and managed via the [Personalize Monitor](https://github.com/aws-samples/amazon-personalize-monitor) application.*\n\nFor best practices on integrating with and operating [Amazon Personalize](https://aws.amazon.com/personalize/), please see our [Cheat Sheet](https://github.com/aws-samples/amazon-personalize-samples/blob/master/PersonalizeCheatSheet2.0.md).\n\nResources: [Service Documentation](https://docs.aws.amazon.com/personalize/latest/dg/what-is-personalize.html) | [Personalize Blog](https://aws.amazon.com/blogs/machine-learning/category/artificial-intelligence/amazon-personalize/) | [Samples on GitHub](https://github.com/aws-samples/amazon-personalize-samples)\n"
 23 | 		}
 24 | 	}
 25 | 	{{#dataset_groups}}
 26 | 	,{
 27 | 		"type": "text",
 28 | 		"width": 24,
 29 | 		"height": 1,
 30 | 		"properties": {
 31 | 			"markdown": "\n### Dataset Group: **{{name}}** ({{region}}) | [Manage](https://console.aws.amazon.com/personalize/home?region={{region}}#arn:aws:personalize:{{region}}:{{account_id}}:dataset-group${{name}}/campaigns)\n"
 32 | 		}
 33 | 	},
 34 | 	{
 35 | 		"type": "metric",
 36 | 		"width": 8,
 37 | 		"height": 8,
 38 | 		"properties": {
 39 | 			"metrics": [
 40 | 				{{#campaigns}}
 41 | 				["{{namespace}}", "minProvisionedTPS", "CampaignArn", "{{campaign_arn}}", {
 42 | 					"label": "{{name}} minProvisionedTPS"
 43 | 				}],
 44 | 				["{{namespace}}", "averageTPS", "CampaignArn", "{{campaign_arn}}", {
 45 | 					"label": "{{name}} averageTPS"
 46 | 				}]{{^last_campaign}}, {{/last_campaign}}
 47 | 				{{/campaigns}}
 48 | 			],
 49 | 			"region": "{{region}}",
 50 | 			"view": "timeSeries",
 51 | 			"stacked": false,
 52 | 			"stat": "Average",
 53 | 			"period": 300,
 54 | 			"title": "Actual vs Provisioned TPS",
 55 | 			"yAxis": {
 56 | 				"left": {
 57 | 					"label": "TPS",
 58 | 					"min": 0,
 59 | 					"showUnits": false
 60 | 				},
 61 | 				"right": {
 62 | 					"showUnits": true,
 63 | 					"label": ""
 64 | 				}
 65 | 			},
 66 | 			"annotations": {
 67 | 				"horizontal": [{
 68 | 					"label": "Lowest TPS Allowed",
 69 | 					"value": 1
 70 | 				}]
 71 | 			}
 72 | 		}
 73 | 	},
 74 | 	{
 75 | 		"type": "metric",
 76 | 		"width": 8,
 77 | 		"height": 8,
 78 | 		"properties": {
 79 | 			"view": "timeSeries",
 80 | 			"stacked": false,
 81 | 			"metrics": [
 82 | 				{{#campaigns}}
 83 | 				["{{namespace}}", "campaignUtilization", "CampaignArn", "{{campaign_arn}}", {
 84 | 					"label": "{{name}} campaignUtilization"
 85 | 				}]{{^last_campaign}}, {{/last_campaign}}
 86 | 				{{/campaigns}}
 87 | 			],
 88 | 			"region": "{{region}}",
 89 | 			"title": "Campaign Utilization"
 90 | 		}
 91 | 	},
 92 | 	{
 93 | 		"type": "metric",
 94 | 		"width": 8,
 95 | 		"height": 8,
 96 | 		"properties": {
 97 | 			"view": "timeSeries",
 98 | 			"stacked": false,
 99 | 			"metrics": [
100 | 				{{#campaigns}}
101 | 				["AWS/Personalize", "{{campaign_latency_metric_name}}", "CampaignArn", "{{campaign_arn}}", {
102 | 					"label": "{{name}} {{campaign_latency_metric_name}}"
103 | 				}]{{^last_campaign}}, {{/last_campaign}}
104 | 				{{/campaigns}}
105 | 			],
106 | 			"region": "{{region}}",
107 | 			"title": "Campaign Latency"
108 | 		}
109 | 	}
110 | 	{{/dataset_groups}}
111 | 	]
112 | }


--------------------------------------------------------------------------------
/src/personalize_delete_campaign_function/README.md:
--------------------------------------------------------------------------------
 1 | # Amazon Personalize Monitor - Delete Campaign Function
 2 | 
 3 | This Lambda function deletes a Personalize campaign. It is called as the target of an EventBridge rule that matches events with the `DeletePersonalizeCampaign` detail-type. The [personalize-monitor](../personalize_monitor_function/) function publishes this event when the `AutoDeleteIdleCampaigns` deployment parameter is `Yes` AND a monitored campaign has been idle more than `IdleCampaignThresholdHours` hours. Therefore, an idle campaign is one that has not had any `GetRecommendations` or `GetPersonalizedRanking` calls in the last `IdleCampaignThresholdHours` hours.
 4 | 
 5 | This function will also delete any CloudWatch alarms that were dynamically created by this application for the deleted campaign. Alarms can be created for idle campaigns and low utilization campaigns via the `AutoCreateIdleCampaignAlarms` and `AutoCreateCampaignUtilizationAlarms` deployment parameters.
 6 | 
 7 | ## How it works
 8 | 
 9 | The EventBridge event structure that triggers this function looks something like this:
10 | 
11 | ```javascript
12 | {
13 |     "source": "personalize.monitor",
14 |     "detail-type": "DeletePersonalizeCampaign",
15 |     "resources": [ CAMPAIGN_ARN_TO_DELETE ],
16 |     "detail": {
17 |         "CampaignARN": CAMPAIGN_ARN_TO_DELETE,
18 |         "CampaignUtilization": CURRENT_UTILIZATION,
19 |         "CampaignAgeHours": CAMPAIGN_AGE_IN_HOURS,
20 |         "IdleCampaignThresholdHours": CAMPAIGN_IDLE_HOURS,
21 |         "TotalRequestsDuringIdleThresholdHours": 0,
22 |         "Reason": DESCRIPTIVE_REASON_FOR_DELETE
23 |     }
24 | }
25 | ```
26 | 
27 | This function can also be invoked directly as part of your own operational process. The event you pass to the function only requires the campaign ARN as follows. 
28 | 
29 | ```javascript
30 | {
31 |     "CampaignARN": CAMPAIGN_ARN_TO_DELETE,
32 |     "Reason": OPTIONAL_DESCRIPTIVE_REASON_FOR_DELETE
33 | }
34 | ```
35 | 
36 | The Personalize [DeleteCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_DeleteCampaign.html) API is used to delete the campaign.
37 | 
38 | ## Published events
39 | 
40 | When the deletion of a campaign and any dynamically created CloudWatch alarms for the campaign have been successfully initiated by this function, two events are published to EventBridge. One event triggers a notification to the SNS topic for this application and the other triggers the CloudWatch dashboard to be rebuilt.
41 | 
42 | ### Delete notification
43 | 
44 | The following event is published to EventBridge to signal that a campaign has been deleted.
45 | 
46 | ```javascript
47 | {
48 |     "source": "personalize.monitor",
49 |     "detail-type": "PersonalizeCampaignDeleted",
50 |     "resources": [ CAMPAIGN_ARN_DELETED ],
51 |     "detail": {
52 |         "CampaignARN": CAMPAIGN_ARN_DELETED,
53 |         "Reason": DESCRIPTIVE_REASON_FOR_DELETE
54 |     }
55 | }
56 | ```
57 | 
58 | An EventBridge rule is set up that will target an SNS topic with `NotificationEndpoint` as the subscriber. This is the email address you provided at deployment time. If you'd like, you can customize how these notification events are handled in the EventBridge and SNS consoles.
59 | 
60 | ### Rebuild CloudWatch dashboard
61 | 
62 | Since a monitored campaign has been deleted, the CloudWatch dashboard needs to be rebuilt so that the campaign is removed from the widgets. This is accomplished by publishing a `BuildPersonalizeMonitorDashboard` event that is processed by the [dashboard_mgmt](../dashboard_mgmt_function/) function.
63 | 
64 | ```javascript
65 | {
66 |     "source": "personalize.monitor",
67 |     "detail-type": "BuildPersonalizeMonitorDashboard",
68 |     "resources": [ CAMPAIGN_ARN_DELETED ],
69 |     "detail": {
70 |         "CampaignARN": CAMPAIGN_ARN_DELETED,
71 |         "Reason": DESCRIPTIVE_REASON_FOR_REBUILD
72 |     }
73 | }
74 | ```
75 | 


--------------------------------------------------------------------------------
/src/personalize_update_campaign_tps_function/README.md:
--------------------------------------------------------------------------------
 1 | # Amazon Personalize Monitor - Campaign Provisioned TPS Update Function
 2 | 
 3 | This Lambda function adjusts the `minProvisionedTPS` value for a Personalize campaign. It is called as the target of EventBridge rules for events emitted by the [personalize_monitor](../personalize_monitor_function/) function when configured to update campaigns based on actual TPS activity. You can also incorporate this function into your own operations to scale campaigns up and down. For example, if you know your campaign will experience a massive spike in requests at a certain time (i.e. flash sale) and you want to pre-warm your campaign capacity, you can create a [CloudWatch event](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/RunLambdaSchedule.html) to call this function 30 minutes before the expected spike in traffic to increase the `minProvisionedTPS` and then again after the traffic event to lower the `minProvisionedTPS`. Alternatively, if there are certain events that occur in your application that you know will generate a predictably higher or lower volume of requests than the current `minProvisionedTPS` **AND** Personalize's auto-scaling will not suffice, you can use this function as a trigger to adjust `minProvisionedTPS` accordingly.
 4 | 
 5 | ## How it works
 6 | 
 7 | The EventBridge event structure that triggers this function looks something like this:
 8 | 
 9 | ```javascript
10 | {
11 |     "source": "personalize.monitor",
12 |     "detail-type": "UpdatePersonalizeCampaignMinProvisionedTPS",
13 |     "resources": [ CAMPAIGN_ARN_TO_UPDATE ],
14 |     "detail": {
15 |         "CampaignARN": CAMPAIGN_ARN_TO_UPDATE,
16 |         "CampaignUtilization": CURRENT_UTILIZATION,
17 |         "CampaignAgeHours": CAMPAIGN_AGE_IN_HOURS,
18 |         "CurrentProvisionedTPS": CURRENT_MIN_PROVISIONED_TPS,
19 |         "MinProvisionedTPS": NEW_MIN_PROVISIONED_TPS,
20 |         "MinAverageTPS": MIN_AVERAGE_TPS_LAST_24_HOURS,
21 |         "MaxAverageTPS": MAX_AVERAGE_TPS_LAST_24_HOURS,
22 |         "Datapoints": [ CW_METRIC_DATAPOINTS_LAST_24_HOURS ],
23 |         "Reason": DESCRIPTIVE_REASON_FOR_UPDATE
24 |     }
25 | }
26 | ```
27 | 
28 | This function can also be invoked directly as part of your own operational process. The event you pass to the function only requires the campaign ARN and new `minProvisionedTPS` as follows. 
29 | 
30 | ```javascript
31 | {
32 |     "CampaignARN": "CAMPAIGN_ARN_HERE",
33 |     "MinProvisionedTPS": NEW_MIN_PROVISIONED_TPS_HERE,
34 |     "Reason": DESCRIPTIVE_REASON_FOR_UPDATE
35 | }
36 | ```
37 | 
38 | The Personalize [UpdateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_UpdateCampaign.html) API is used to update the `minProvisionedTPS` value.
39 | 
40 | ## Published events
41 | 
42 | When an update of a campaign's `minProvisionedTPS` has been successfully initiated by this function, an event is published to EventBridge to trigger a notification.
43 | 
44 | > Since it can take several minutes for a campaign to redeploy after updating its `minProvisionedTPS`, you will receive the notification when the redeploy starts. The campaign will continue to respond to `GetRecommendations`/`GetPersonalizedRanking` API requests while it is redeploying. **Therefore, there will be no interruption of service.**
45 | 
46 | ### Update minProvisionedTPS notification
47 | 
48 | The following event is published to EventBridge to signal that an update to a campaign has been initiated.
49 | 
50 | ```javascript
51 | {
52 |     "source": "personalize.monitor",
53 |     "detail-type": "PersonalizeCampaignMinProvisionedTPSUpdated",
54 |     "resources": [ CAMPAIGN_ARN_UPDATED ],
55 |     "detail": {
56 |         "CampaignARN": CAMPAIGN_ARN_UPDATED,
57 |         "NewMinProvisionedTPS": NEW_TPS,
58 |         "Reason": DESCRIPTIVE_REASON_FOR_UPDATE
59 |     }
60 | }
61 | ```
62 | 
63 | An EventBridge rule is set up that will target an SNS topic with `NotificationEndpoint` as the subscriber. This is the email address you provided at deployment time. If you'd like, you can customize how these notification events are handled or add your own targets in the EventBridge and SNS consoles.
64 | 


--------------------------------------------------------------------------------
/src/dashboard_mgmt_function/README.md:
--------------------------------------------------------------------------------
 1 | # Amazon Personalize Monitor - CloudWatch Dashboard Create/Update/Delete Function
 2 | 
 3 | The [dashboard_mgmt.py](./dashboard_mgmt.py) Lambda function is responsible for creating, updating/refreshing, and deleting the [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) for this application. It is called in the following contexts:
 4 | 
 5 | - As part of the CloudFormation deployment process for this application as a [custom resource](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/template-custom-resources.html) (create, update, delete).
 6 | - In response to the `BuildPersonalizeMonitorDashboard` CloudWatch event being handled. This event is published to the default [Amazon EventBridge](https://docs.aws.amazon.com/eventbridge/latest/userguide/what-is-amazon-eventbridge.html) event bus when a monitored campaign is automatically deleted so that the dashboard can be rebuilt. An EventBridge rule is used to trigger this function to be invoked when the event is received.
 7 | - At the top of every hour, triggered by a scheduled CloudWatch event. This ensures that any campaigns that are created or deleted (outside of this application) that meet the monitoring criteria are added to the dashboard.
 8 | 
 9 |  The dashboard will include line graph widgets for Actual vs Provisioned TPS, Campaign Utilization, and Campaign Latency for the Personalize campaigns you wish to monitor. Here is an example of a dashboard.
10 | 
11 | ![Personalize Monitor CloudWatch Dashboard](../../images/personalize-monitor-cloudwatch-dashboard.png)
12 | 
13 | ## How it works
14 | 
15 | The EventBridge event structure that triggers this function looks something like this:
16 | 
17 | ```javascript
18 | {
19 |     "source": "personalize.monitor",
20 |     "detail-type": "BuildPersonalizeMonitorDashboard",
21 |     "resources": [ CAMPAIGN_ARN_THAT_TRIGGERED ],
22 |     "detail": {
23 |         "Reason": DESCRIPTIVE_REASON_FOR_UPDATE
24 |     }
25 | }
26 | ```
27 | 
28 | This function can also be invoked directly as part of your own operational process. The `Reason` is optional and just used for logging. 
29 | 
30 | ```javascript
31 | {
32 |     "Reason": DESCRIPTIVE_REASON_FOR_UPDATE
33 | }
34 | ```
35 | 
36 | ### Create/Update
37 | 
38 | When called as part of this application's create or update deployment process or as a result of the `BuildPersonalizeMonitorDashboard`, the function first determines what Personalize campaigns should be monitored based on the CloudFormation template parameters you specify when you [installed](../README.md#installing-the-application) the application. The monitored campaigns are grouped by [dataset group](https://docs.aws.amazon.com/personalize/latest/dg/data-prep-ds-group.html) and placed in a dictionary that is passed to the python [chevron](https://github.com/noahmorrison/chevron) library to render the [dashboard template](./dashboard-template.mustache) file. The template uses the [mustache templating language](http://mustache.github.io/) to build the widgets.
39 | 
40 | Once the template is rendered as dashboard source (JSON), the dashboard source is used to create or update the CloudWatch dashboard by calling the [PutDashboard API](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutDashboard.html).
41 | 
42 | Therefore, if you want to change what campaigns are monitored, just re-deploy this application and the current dashboard will be overwritten with your campaign changes or wait for the dashboard to automatically update itself (subject to campaign monitoring configuration). **This also means that any manual changes you make to the Personalize Monitor dashboard will be lost.** If you want to add your own widgets to the dashboard or change the existing widgets, you can fork this repository, change the [dashboard-template.mustache](./dashboard-template.mustache) template file, and deploy into your AWS account.
43 | 
44 | ### Delete
45 | 
46 | When the CloudFormation stack is deleted for this application, this function will delete the dashboard.
47 | 
48 | ## Calling from your own code
49 | 
50 | You can trigger the CloudWatch dashboard to be rebuilt by publishing the `BuildPersonalizeMonitorDashboard` detail-type from your own code. Here is an example in Python.
51 | 
52 | ```python
53 | import boto3
54 | import json
55 | 
56 | event_bridge = boto3.client('events')
57 | 
58 | event_bridge.put_events(
59 |     Entries=[
60 |         {
61 |             'Source': 'personalize.monitor',
62 |             'DetailType': 'BuildPersonalizeMonitorDashboard',
63 |             'Detail': json.dumps({
64 |                 'Reason': 'Rebuild the dashboard because I said so'
65 |             })
66 |         }
67 |     ]
68 | )
69 | ```


--------------------------------------------------------------------------------
/src/dashboard_mgmt_function/dashboard_mgmt.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  2 | # SPDX-License-Identifier: MIT-0
  3 | 
  4 | """Manages create/update/delete of the Personalize Monitor CloudWatch dashboard
  5 | 
  6 | This function is called two ways:
  7 | 
  8 | 1. From CloudFormation when the application is deployed, updated, or deleted in an AWS 
  9 | account. When the resource is created, this function will create the Personalize 
 10 | Monitor Dashboard in CloudWatch populated with widgets for monitoring Personalize
 11 | resources configured as deployment parameters. 
 12 | 
 13 | When this resource is updated (i.e. redeployed), the dashboard will be rebuilt and 
 14 | updated/replaced.
 15 | 
 16 | When this resource is deleted, this function will delete the CloudWatch Dashboard.
 17 | 
 18 | 2. As the target of an EventBridge rule that signals that the dashboard should be 
 19 | rebuilt as a result of an event occurring. The event could be after a campaign has 
 20 | been deleted and therefore a good point to rebuild the dashboard. It could also 
 21 | be setup to periodically rebuild the dashboard on a schedule so it picks up new
 22 | campaigns too.
 23 | 
 24 | See the layer_dashboard Lambda Layer for details on how the dashboard is built.
 25 | """
 26 | 
 27 | import json
 28 | import os
 29 | import boto3
 30 | import chevron
 31 | 
 32 | from crhelper import CfnResource
 33 | from aws_lambda_powertools import Logger
 34 | from common import (
 35 |     extract_region,
 36 |     extract_account_id,
 37 |     get_client,
 38 |     get_configured_active_campaigns
 39 | )
 40 | 
# Structured logger used by every function in this module.
logger = Logger()
# crhelper resource helper; routes CloudFormation custom resource requests to
# the functions registered below with @helper.create/update/delete.
helper = CfnResource()

# CloudWatch client created once at import time and shared across invocations.
cloudwatch = boto3.client('cloudwatch')

# Name of the CloudWatch dashboard that this function creates, replaces, and deletes.
DASHBOARD_NAME = 'Personalize-Monitor'
 47 | 
 48 | def build_dashboard(event):
 49 |     # Will hold the data used to render the template.
 50 |     template_data = {}
 51 | 
 52 |     template_data['namespace'] = 'PersonalizeMonitor'
 53 |     template_data['current_region'] = os.environ['AWS_REGION']
 54 | 
 55 |     logger.debug('Loading active campaigns')
 56 | 
 57 |     campaigns = get_configured_active_campaigns(event)
 58 |     template_data['active_campaign_count'] = len(campaigns)
 59 | 
 60 |     # Group campaigns by dataset group so we can create DSG specific widgets in rows
 61 |     campaigns_by_dsg_arn = {}
 62 |     # Holds DSG info so we only have describe once per DSG
 63 |     dsgs_by_arn = {}
 64 | 
 65 |     for campaign in campaigns:
 66 |         logger.info('Campaign %s will be added to the dashboard', campaign['campaignArn'])
 67 | 
 68 |         campaign_region = extract_region(campaign['campaignArn'])
 69 | 
 70 |         personalize = get_client('personalize', campaign_region)
 71 | 
 72 |         response = personalize.describe_solution_version(solutionVersionArn = campaign['solutionVersionArn'])
 73 | 
 74 |         dsg_arn = response['solutionVersion']['datasetGroupArn']
 75 |         recipe_arn = response['solutionVersion']['recipeArn']
 76 | 
 77 |         dsg = dsgs_by_arn.get(dsg_arn)
 78 |         if not dsg:
 79 |             response = personalize.describe_dataset_group(datasetGroupArn = dsg_arn)
 80 |             dsg = response['datasetGroup']
 81 |             dsgs_by_arn[dsg_arn] = dsg
 82 | 
 83 |         campaign_datas = campaigns_by_dsg_arn.get(dsg_arn)
 84 |         if not campaign_datas:
 85 |             campaign_datas = []
 86 |             campaigns_by_dsg_arn[dsg_arn] = campaign_datas
 87 | 
 88 |         campaign_data = {
 89 |             'name': campaign['name'],
 90 |             'campaign_arn': campaign['campaignArn'],
 91 |             'region': campaign_region
 92 |         }
 93 | 
 94 |         if recipe_arn == 'arn:aws:personalize:::recipe/aws-personalized-ranking':
 95 |             campaign_data['campaign_latency_metric_name'] = 'GetPersonalizedRankingLatency'
 96 |         else:
 97 |             campaign_data['campaign_latency_metric_name'] = 'GetRecommendationsLatency'
 98 | 
 99 |         campaign_datas.append(campaign_data)
100 | 
101 |     dsgs_for_template = []
102 | 
103 |     for dsg_arn, campaign_datas in campaigns_by_dsg_arn.items():
104 |         dsg = dsgs_by_arn[dsg_arn]
105 | 
106 |         # Minor hack to know when we're on the last item in list when iterating in template.
107 |         campaign_datas[len(campaign_datas) - 1]['last_campaign'] = True
108 | 
109 |         dsgs_for_template.append({
110 |             'name': dsg['name'],
111 |             'region': extract_region(dsg_arn),
112 |             'account_id': extract_account_id(dsg_arn),
113 |             'campaigns': campaign_datas
114 |         })
115 | 
116 |     template_data['dataset_groups'] = dsgs_for_template
117 | 
118 |     # Render template and use as dashboard body.
119 |     with open('dashboard-template.mustache', 'r') as f:
120 |         dashboard = chevron.render(f, template_data)
121 | 
122 |         logger.debug(json.dumps(dashboard, indent = 2, default = str))
123 | 
124 |         logger.info('Adding/updating dashboard')
125 | 
126 |         cloudwatch.put_dashboard(
127 |             DashboardName = DASHBOARD_NAME,
128 |             DashboardBody = dashboard
129 |         )
130 | 
def delete_dashboard():
    """Delete the CloudWatch dashboard named by ``DASHBOARD_NAME``."""
    logger.info('Deleting dashboard')

    dashboard_names = [ DASHBOARD_NAME ]
    cloudwatch.delete_dashboards(DashboardNames = dashboard_names)
137 | 
@helper.create
@helper.update
def create_or_update_resource(event, _):
    """Custom resource create/update handler: (re)builds the dashboard.

    Registered with the crhelper ``helper`` for both create and update
    requests. The second (context) argument is unused.
    """
    build_dashboard(event)
142 | 
@helper.delete
def delete_resource(event, _):
    """Custom resource delete handler: removes the dashboard.

    Registered with the crhelper ``helper`` for delete requests. Neither
    argument is used.
    """
    delete_dashboard()
146 | 
@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context):
    """Lambda entry point for dashboard management.

    Events carrying a ``RequestType`` are treated as CloudFormation custom
    resource requests and handed to the crhelper ``helper``. Any other event
    is treated as a request to build the dashboard; an optional ``Reason``
    (either under ``detail`` or at the top level) is logged.
    """
    if event.get('RequestType'):
        logger.info('Called via CloudFormation as a custom resource; letting CfnResource route request')
        helper(event, context)
        return

    logger.info('Called via Invoke; assuming caller wants to build dashboard')

    # EventBridge events nest the reason under "detail"; direct invokes put it at the top level.
    detail = event.get('detail')
    reason = detail.get('Reason') if detail else event.get('Reason')

    if reason:
        logger.info('Reason for dashboard build: %s', reason)

    build_dashboard(event)


--------------------------------------------------------------------------------
/src/personalize_monitor_function/README.md:
--------------------------------------------------------------------------------
 1 | # Amazon Personalize Monitor - Core Monitor Function
 2 | 
 3 | The [personalize_monitor.py](./personalize_monitor.py) Lambda is called every 5 minutes by a CloudWatch scheduled event rule to generate the CloudWatch metrics needed to populate the Personalize Monitor dashboard line graph widgets and to trigger the CloudWatch alarms for low campaign utilization and idle campaign detection (if configured). Also, if the `AutoDeleteIdleCampaigns` deployment parameter is `Yes` AND a monitored campaign has been idle more than `IdleCampaignThresholdHours` hours, this function will publish a `DeletePersonalizeCampaign` event to EventBridge that is handled by the [personalize_delete_campaign](../personalize_delete_campaign_function/) function.  An idle campaign is one that has not had any `GetRecommendations` or `GetPersonalizedRanking` calls in the last `IdleCampaignThresholdHours` hours. Finally, this function will adjust a campaign's `minProvisionedTPS` (down only) if the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter is `Yes`.
 4 | 
 5 | ## How it works
 6 | 
 7 | The function first determines what Personalize campaigns should be monitored based on the CloudFormation template parameters you specify when you [install](../README.md#installing-the-application) the application.
 8 | 
 9 | ## CloudWatch Metrics
10 | 
11 | The following custom CloudWatch metrics are generated by this function on 5 minute intervals. You can find these metrics in the AWS console under CloudWatch and then Metrics or you can query them using the CloudWatch API.
12 | 
13 | | Namespace | MetricName | Dimensions | Unit | Description |
14 | | --- | --- | --- | --- | --- |
15 | | PersonalizeMonitor | monitoredCampaignCount | | Count | Number of campaigns currently being monitored at interval |
16 | | PersonalizeMonitor | minProvisionedTPS | CampaignArn | Count/Second | `minProvisionedTPS` value for the campaign at interval |
17 | | PersonalizeMonitor | averageTPS | CampaignArn | Count/Second | Average TPS for the campaign at interval |
18 | | PersonalizeMonitor | campaignUtilization | CampaignArn | Percent | Utilization percentage of `averageTPS` vs `minProvisionedTPS` at interval |
19 | 
20 | ### How is averageTPS calculated?
21 | 
22 | The `averageTPS` metric value for each monitored campaign is calculated by first determining the number of requests made to the campaign during the 5 minute interval and dividing by 300 (the number of seconds in 5 minutes). The number of requests is pulled from the `GetRecommendations` or `GetPersonalizedRanking` metric (depending on the recipe for the campaign's solution) for the campaign from the `AWS/Personalize` namespace. This metric is automatically updated by Personalize itself.
23 | 
24 | ## CloudWatch Alarms (optional)
25 | 
26 | You can optionally have CloudWatch alarms dynamically created for monitored campaigns for low campaign utilization and idle campaigns.
27 | 
28 | ### Low Campaign Utilization Alarm
29 | 
30 | If you set the `AutoCreateCampaignUtilizationAlarms` CloudFormation template parameter to `Yes` when you installed this application, this function will automatically create a CloudWatch alarm for every campaign that it monitors. The alarm will trigger when the `campaignUtilization` custom metric described above drops below the `CampaignThresholdAlarmLowerBound` installation parameter for 9 out of 12 evaluation periods. Since the intervals are 5 minutes, that means that 9 of the 12 five minute evaluations over a 60 minute span must be below the threshold to enter an alarm status. The same rule applies to transition from alarm to OK status. The alarm will be created in the region where the campaign was created. An [SNS](https://aws.amazon.com/sns/) topic created by this application will be used as the alarm and ok actions and the `NotificationEndpoint` (email address) deployment parameter will be setup as a subscriber to the topic. **Be sure to confirm the subscription sent when this application is deployed by clicking on the one-time confirmation email sent by SNS.** 
31 | 
32 | The alarm will have its actions disabled when the `minProvisionedTPS` is 1 and enabled when `minProvisionedTPS` is > 1 so that notifications are only sent when utilization can be impacted by adjusting `minProvisionedTPS`.
33 | 
34 | ### Idle Campaign Alarm
35 | 
36 | If you set the `AutoCreateIdleCampaignAlarms` CloudFormation template parameter to `Yes` when you installed this application, this function will automatically create a CloudWatch alarm for every monitored campaign that is idle for at least `IdleCampaignThresholdHours` hours. The actions for the alarm will be enabled only after the campaign has existed for `IdleCampaignThresholdHours` as well. The `GetRecommendations` or `GetPersonalizedRanking` (depending on the campaign's recipe) will be used to assess the campaign's idle state. The alarm will be created in the region where the campaign was created. An [SNS](https://aws.amazon.com/sns/) topic created by this application will be used as the alarm and ok actions and the `NotificationEndpoint` (email address) deployment parameter will be setup as a subscriber to the topic. **Be sure to confirm the subscription sent when this application is deployed by clicking on the one-time confirmation email sent by SNS.** 
37 | 
38 | ## Automatically adjusting campaign minProvisionedTPS (optional)
39 | 
40 | If the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter is `Yes`, this function will check the actual hourly TPS over the last 14 days against the currently configured `minProvisionedTPS` and look for opportunities to reduce the campaign's `minProvisionedTPS` to optimize utilization and reduce costs. It does this by checking the campaign's request volume for the previous 14 days on hourly intervals and finding the hour with the lowest average TPS (low watermark). If the low watermark average is less than the campaign's `minProvisionedTPS` AND the campaign is more than 1 day old, it will drop the `minProvisionedTPS` by 25%. This process will be repeated each hour until either the `minProvisionedTPS` meets the low watermark TPS or the `minProvisionedTPS` reaches 1 (the lowest allowed value). **This function will NOT increase the `minProvisionedTPS` for a campaign.** Instead it will rely on Personalize to auto-scale campaigns up and back down to `minProvisionedTPS` to meet demand. 
41 | 
42 | > Since it can take several minutes for a campaign to redeploy after updating its `minProvisionedTPS`, you will receive the notification when the redeploy starts. The campaign will continue to respond to `GetRecommendations`/`GetPersonalizedRanking` API requests while it is redeploying. There will be no interruption of service.
43 | 
44 | See the [personalize_update_campaign_tps](../personalize_update_campaign_tps_function/) function for details on the update function.
45 | 
46 | ## Automatically deleting idle campaigns (optional)
47 | 
48 | If the `AutoDeleteIdleCampaigns` deployment parameter is `Yes`, this function will perform additional checks once per hour for each monitored campaign to see if it has been idle for more than `IdleCampaignThresholdHours` hours. The purpose of this feature is to prevent abandoned campaigns from continuing to incur costs when they are no longer being used. Campaign checks are distributed across each hour in 10 minute blocks in an attempt to spread out the API calls needed to check and update campaigns.
49 | 
50 | To avoid too aggressively deleting campaigns, new campaigns that are not more than `IdleCampaignThresholdHours` hours old are exempt from being deleted. Similarly, if a campaign has been updated within `IdleCampaignThresholdHours`, it will also be exempt from being automatically deleted. The idea is that new or actively updated campaigns are likely not safe to delete.  
51 | 


--------------------------------------------------------------------------------
/src/layer/common.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  2 | # SPDX-License-Identifier: MIT-0
  3 | 
  4 | """
  5 | Lambda layer functions shared across Lambda functions in this application
  6 | """
  7 | 
  8 | import boto3
  9 | import os
 10 | import random
 11 | 
 12 | from botocore.exceptions import ClientError
 13 | from aws_lambda_powertools import Logger
 14 | from expiring_dict import ExpiringDict
 15 | 
# Child logger for this shared module.
logger = Logger(child=True)

# boto3 clients cached as {region_name: {service_name: client}}; see get_client().
_clients_by_region = {}
# Since the DescribeCampaign API easily throttles and we just need
# the minProvisionedTPS from the campaign, use a cache to help smooth
# out periods where we get throttled.
# NOTE(review): the argument is presumably the entry TTL in seconds (22 minutes) -- confirm against expiring_dict.
_campaign_cache = ExpiringDict(22 * 60)

# Project name; also the base of the alarm name prefix below.
PROJECT_NAME = 'PersonalizeMonitor'
ALARM_NAME_PREFIX = PROJECT_NAME + '-'
 26 | 
 27 | def put_event(detail_type, detail, resources = []):
 28 |     event_bridge = get_client('events')
 29 | 
 30 |     logger.info({
 31 |         'detail_type': detail_type,
 32 |         'detail': detail,
 33 |         'resources': resources
 34 |     })
 35 | 
 36 |     event_bridge.put_events(
 37 |         Entries=[
 38 |             {
 39 |                 'Source': 'personalize.monitor',
 40 |                 'Resources': resources,
 41 |                 'DetailType': detail_type,
 42 |                 'Detail': detail
 43 |             }
 44 |         ]
 45 |     )
 46 | 
 47 | def extract_region(arn):
 48 |     ''' Extracts region from an AWS ARN '''
 49 |     region = None
 50 |     elements = arn.split(':')
 51 |     if len(elements) > 3:
 52 |         region = elements[3]
 53 |         
 54 |     return region
 55 | 
 56 | def extract_account_id(arn):
 57 |     ''' Extracts account ID from an AWS ARN '''
 58 |     account_id = None
 59 |     elements = arn.split(':')
 60 |     if len(elements) > 4:
 61 |         account_id = elements[4]
 62 |         
 63 |     return account_id
 64 | 
 65 | def get_client(service_name, region_name = None):
 66 |     if not region_name:
 67 |         region_name = os.environ['AWS_REGION']
 68 | 
 69 |     ''' Returns boto3 client for a service and region '''
 70 |     clients_by_service = _clients_by_region.get(region_name)
 71 | 
 72 |     if not clients_by_service:
 73 |         clients_by_service = {}
 74 |         _clients_by_region[region_name] = clients_by_service
 75 | 
 76 |     client = clients_by_service.get(service_name)
 77 | 
 78 |     if not client:
 79 |         client = boto3.client(service_name = service_name, region_name = region_name)
 80 |         clients_by_service[service_name] = client
 81 | 
 82 |     return client
 83 | 
 84 | def determine_regions(event):
 85 |     ''' Determines regions from function event or environment '''
 86 |     # Check event first (list of region names)
 87 |     regions = None
 88 |     if event:
 89 |         regions = event.get('Regions')
 90 | 
 91 |     if not regions:
 92 |         # Check environment variable next for list of region names as CSV
 93 |         regions = os.environ.get('Regions')
 94 | 
 95 |     if not regions:
 96 |         # Lastly, use current region from environment.
 97 |         regions = os.environ['AWS_REGION']
 98 | 
 99 |     if regions and isinstance(regions, str):
100 |         regions = [exp.strip(' ') for exp in regions.split(',')]
101 | 
102 |     return regions
103 | 
def determine_campaign_arns(event):
    ''' Resolves the Personalize campaign ARNs to work with from the event or environment

    The "CampaignARNs" value is read from the event first and then from the
    environment; a string is split on commas. The single value "all" (any case)
    expands to every campaign returned by ListCampaigns in the regions given
    by determine_regions(). Raises Exception when no value is configured.
    '''
    configured = event.get('CampaignARNs') if event else None

    if not configured:
        # Fall back to the environment variable (list of campaign ARNs as CSV).
        configured = os.environ.get('CampaignARNs')

    if not configured:
        raise Exception('"CampaignARNs" expression required in event or environment')

    if isinstance(configured, str):
        configured = [item.strip(' ') for item in configured.split(',')]

    logger.debug('CampaignARNs expression: %s', configured)

    # Anything other than the magic value "all" is an explicit list of ARNs.
    if not (len(configured) == 1 and configured[0].lower() == 'all'):
        return configured

    logger.debug('Retrieving ARNs for all active campaigns')

    regions = determine_regions(event)
    logger.debug('Regions to scan for active campaigns: %s', regions)

    discovered = []

    for region in regions:
        personalize = get_client(service_name = 'personalize', region_name = region)

        paginator = personalize.get_paginator('list_campaigns')
        region_arns = [
            campaign['campaignArn']
            for page in paginator.paginate()
            for campaign in page['campaigns']
        ]
        discovered.extend(region_arns)

        logger.debug('Region %s has %d campaigns', region, len(region_arns))

    return discovered
149 | 
def get_configured_active_campaigns(event):
    ''' Returns list of active campaigns as configured by function event and/or environment

    Each configured campaign ARN is described via DescribeCampaign. When that
    call is throttled, the most recently cached description is used if there
    is one. Campaigns that no longer exist, are not ACTIVE, or whose latest
    update is a pending/in-progress delete are skipped. Any other ClientError
    is re-raised.
    '''
    # Copy before shuffling so that a list supplied by the caller (e.g. in the
    # event) is not reordered in place.
    campaign_arns = list(determine_campaign_arns(event))

    # Shuffle the list of arns so we don't try to describe campaigns in the same order each
    # time and potentially use cached campaign details for the same campaigns further down
    # the list due to rare but possible API throttling.
    random.shuffle(campaign_arns)

    campaigns = []

    for campaign_arn in campaign_arns:
        campaign_region = extract_region(campaign_arn)
        personalize = get_client(service_name = 'personalize', region_name = campaign_region)
        campaign = None

        try:
            # Always try the DescribeCampaign API directly first.
            campaign = personalize.describe_campaign(campaignArn = campaign_arn)['campaign']
            _campaign_cache[campaign_arn] = campaign
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == 'ThrottlingException':
                logger.error('ThrottlingException trapped when calling DescribeCampaign API for %s', campaign_arn)

                # Fallback to see if we have a cached Campaign to use instead.
                campaign = _campaign_cache.get(campaign_arn)
                if campaign:
                    logger.warning('Using cached campaign object for %s', campaign_arn)
                else:
                    logger.warning('Campaign %s NOT found in cache; skipping this time', campaign_arn)
            elif error_code == 'ResourceNotFoundException':
                # Campaign has been deleted; log and skip.
                logger.error('Campaign %s no longer exists; skipping', campaign_arn)
            else:
                # Bare raise keeps the original traceback intact.
                raise

        if not campaign:
            continue

        if campaign['status'] != 'ACTIVE':
            logger.info('Campaign %s status is %s and cannot be monitored in this state; skipping', campaign_arn, campaign['status'])
            continue

        latest_update = campaign.get('latestCampaignUpdate')
        latest_status = latest_update['status'] if latest_update else None

        if latest_status in ('DELETE PENDING', 'DELETE IN_PROGRESS'):
            logger.info('Campaign %s latestCampaignUpdate.status is %s and cannot be monitored in this state; skipping', campaign_arn, latest_status)
            continue

        campaigns.append(campaign)

    return campaigns


--------------------------------------------------------------------------------
/README-SAR.md:
--------------------------------------------------------------------------------
  1 | # Amazon Personalize Monitor
  2 | 
  3 | This project contains the source code and supporting files for deploying a serverless application that adds monitoring, alerting, and optimization capabilities for [Amazon Personalize](https://aws.amazon.com/personalize/), an AI service from AWS that allows you to create custom ML recommenders based on your data. Highlights include:
  4 | 
  5 | - Generation of additional [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) to track the Average TPS, `minProvisionedTPS`, and Utilization of Personalize [campaigns](https://docs.aws.amazon.com/personalize/latest/dg/campaigns.html) over time.
  6 | - [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) to alert you via SNS/email when campaign utilization drops below a configurable threshold (optional).
  7 | - [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) populated with graph widgets for Actual vs Provisioned TPS, Campaign Utilization, Campaign Latency, and the number of campaigns being monitored.
  8 | - Capable of monitoring campaigns across multiple regions in the same AWS account.
  9 | - Automatically delete campaigns that have been idle more than a configurable number of hours (optional).
 10 | - Automatically reduce the `minProvisionedTPS` for over-provisioned campaigns to optimize cost (optional).
 11 | 
 12 | ## Why is this important?
 13 | 
 14 | Once you create a solution and solution version based on your data, an Amazon Personalize campaign can be created that allows you to retrieve recommendations in real-time based on the solution version. This is typically how Personalize is integrated into your applications. When an application needs to display personalized recommendations to a user, a [GetRecommendations](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#recommendations) or [GetPersonalizedRanking](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#rankings) API call is made to a campaign to retrieve recommendations. Just like monitoring your own application components is important, monitoring your Personalize campaigns is also important and considered a best practice. This application is designed to help you do just that.
 15 | 
 16 | When you provision a campaign using the [CreateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_CreateCampaign.html) API, you must specify a value for `minProvisionedTPS`. This value specifies the requested _minimum_ provisioned transactions (calls) per second that Amazon Personalize will support for that campaign. As your actual request volume to a campaign approaches its `minProvisionedTPS`, Personalize will automatically provision additional resources to meet your request volume. Then when request volume drops, Personalize will automatically scale back down **no lower** than `minProvisionedTPS`. **Since you are billed based on the higher of actual TPS and `minProvisionedTPS`, it is therefore important to not over-provision your campaigns to optimize cost.** This also means that leaving a campaign idle (active but no longer in-use) will result in unnecessary charges. This application gives you the tools to visualize your campaign utilization, to be notified when there is an opportunity to tune your campaign provisioning, and even take action to reduce and eliminate over-provisioning.
 17 | 
 18 | > General best practice is to set `minProvisionedTPS` to `1`, or your low watermark for campaign recommendations requests, and let Personalize auto-scale campaign resources to meet actual demand.
 19 | 
 20 | See the Amazon Personalize [pricing page](https://aws.amazon.com/personalize/pricing/) for full details on costs.
 21 | 
 22 | ### CloudWatch Dashboard
 23 | 
 24 | When you deploy this application, a [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) is built with widgets for Actual vs Provisioned TPS, Campaign Utilization, and Campaign Latency for the campaigns you wish to monitor. The dashboard gives you critical visual information to assess how your campaigns are performing and being utilized. The data in these graphs can help you properly tune your campaign's `minProvisionedTPS`.
 25 | 
 26 | ![Personalize Monitor CloudWatch Dashboard](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-dashboard.png)
 27 | 
 28 | For more details on the CloudWatch dashboard created and maintained by this application, see the [dashboard_mgmt](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/dashboard_mgmt_function/) function page.
 29 | 
 30 | ### CloudWatch Alarms
 31 | 
 32 | At deployment time, you can optionally have this application automatically create [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) that will alert you when a monitored campaign's utilization drops below a threshold you define for two out of three evaluation periods. Since the intervals are 5 minutes, that means that two of the three 5-minute evaluations over a 15-minute span must be below the threshold to enter an alarm status. The same rule applies to transition from alarm to OK status. The alarms will be set up to alert you via email through an SNS topic. Once the alarms are set up, you can alternatively link them to any operations and messaging tools you already use (e.g. Slack, PagerDuty, etc.).
 33 | 
 34 | ![Personalize Monitor CloudWatch Alarms](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-alarms.png)
 35 | 
 36 | For more details on the CloudWatch alarms created by this application, see the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) function page.
 37 | 
 38 | ### CloudWatch Metrics
 39 | 
 40 | To support the CloudWatch dashboard and alarms described above, a few new custom [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) are added for the monitored campaigns. These metrics are populated by the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) Lambda function that is set up to run every 5 minutes in your account. You can find these metrics in CloudWatch under Metrics in the "PersonalizeMonitor" namespace.
 41 | 
 42 | ![Personalize Monitor CloudWatch Metrics](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-metrics.png)
 43 | 
 44 | For more details on the custom metrics created by this application, see the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) function page.
 45 | 
 46 | ### Cost optimization (optional)
 47 | 
 48 | This application can be optionally configured to automatically perform cost optimization actions for your Amazon Personalize campaigns.
 49 | 
 50 | #### Idle campaigns
 51 | Idle campaigns are those that have been provisioned but are not receiving any `GetRecommendations`/`GetPersonalizedRanking` calls. Since costs are incurred while a campaign is active regardless of whether it receives any requests, detecting and eliminating these idle campaigns can be an important cost optimization activity. This can be particularly useful in non-production AWS accounts such as development and testing. See the `AutoDeleteIdleCampaigns` and `IdleCampaignThresholdHours` deployment parameters in the installation instructions below and the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function#automatically-deleting-idle-campaigns-optional) function for details.
 52 | 
 53 | #### Over-provisioned campaigns
 54 | 
 55 | Properly provisioning campaigns, as described earlier, is also an important cost optimization activity. This application can be configured to automatically reduce a campaign's `minProvisionedTPS` based on actual request volume. This will optimize a campaign's utilization when request volume is lower while relying on Personalize to auto-scale based on actual activity. See the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter below and the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function#automatically-adjusting-campaign-minprovisionedtps-optional) function for details.
 56 | 
 57 | ### Architecture
 58 | 
 59 | The following diagram depicts how the Lambda functions in this application work together using an event-driven approach built on [Amazon EventBridge](https://docs.aws.amazon.com/eventbridge/latest/userguide/what-is-amazon-eventbridge.html). The [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) function is invoked every five minutes to generate CloudWatch metric data based on the monitored campaigns and create campaign utilization alarms (if configured). It also generates events which are published to EventBridge that trigger activities such as optimizing a campaign's `minProvisionedTPS`, deleting idle campaigns, updating the Personalize Monitor CloudWatch dashboard, and sending notifications. This approach allows you to more easily integrate these functions into your own operations by sending your own events, say, to trigger the dashboard to be rebuilt after you create a campaign or register your own targets to events generated by this application.
 60 | 
 61 | ![Personalize Monitor Architecture](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-architecture.png)
 62 | 
 63 | See the readme pages for each function for details on the events that they produce and consume.
 64 | 
 65 | ## Installing the application
 66 | 
 67 | ***IMPORTANT NOTE:** Deploying this application in your AWS account will create and consume AWS resources, which will cost money. For example, charges are incurred for the CloudWatch dashboard, the Lambda function that runs every 5 minutes to collect additional monitoring metrics, CloudWatch alarms, logging, and so on. Therefore, if after installing this application you choose not to use it as part of your monitoring strategy, be sure to follow the Uninstall instructions in the next section to avoid ongoing charges and to clean up all data.*
 68 | 
 69 | | Parameter | Description | Default |
 70 | | --- | --- | --- |
 71 | | CampaignARNs | Comma separated list of Personalize campaign ARNs to monitor or `all` to monitor all active campaigns. It is recommended to use `all` so that any new campaigns that are added after deployment will be automatically detected, monitored, and have alarms created (optional) | `all` |
 72 | | Regions | Comma separated list of AWS regions to monitor campaigns. Only applicable when `all` is used for `CampaignARNs`. Leaving this value blank will default to the region where this application is deployed (i.e. `AWS Region` parameter above). | |
 73 | | AutoCreateCampaignUtilizationAlarms | Whether to automatically create a utilization CloudWatch alarm for each monitored campaign. | `Yes` |
 74 | | CampaignThresholdAlarmLowerBound | Minimum threshold value (in percent) to enter alarm state for campaign utilization. This value is only relevant if `AutoCreateCampaignUtilizationAlarms` is `Yes`. | `100` |
 75 | | AutoAdjustCampaignMinProvisionedTPS | Whether to automatically compare campaign request activity against the campaign's `minProvisionedTPS` to determine if `minProvisionedTPS` can be reduced to optimize utilization. | `Yes` |
 76 | | AutoCreateIdleCampaignAlarms | Whether to automatically create an idle detection CloudWatch alarm for each monitored campaign. | `Yes` |
 77 | | IdleCampaignThresholdHours | Number of hours that a campaign must be idle (i.e. no requests) before it is automatically deleted. `AutoDeleteIdleCampaigns` must be `Yes` for idle campaign deletion to occur. | `24` |
 78 | | AutoDeleteIdleCampaigns | Whether to automatically delete idle campaigns. An idle campaign is one that has not had any requests in `IdleCampaignThresholdHours` hours. | `No` |
 79 | | NotificationEndpoint | Email address to receive alarm and ok notifications and campaign delete and update events (optional). An [SNS](https://aws.amazon.com/sns/) topic is created and this email address will be added as a subscriber to that topic. You will receive a confirmation email for the SNS topic subscription so be sure to click the confirmation link in that email to ensure you receive notifications. | |
 80 | 
 81 | ## Uninstalling the application
 82 | 
 83 | To remove the resources created by this application in your AWS account, be sure to uninstall the application.
 84 | 
 85 | ## FAQs
 86 | 
 87 | ***Q: Can I use this application to determine my accumulated inference charges during the month?***
 88 | 
 89 | ***A:*** No! Although the `actualTPS` and `minProvisionedTPS` custom metrics generated by this application may be used to calculate an approximation of your accumulated inference charges, it should **never** be used as a substitute or proxy for actual Personalize inference costs. Always consult your AWS Billing Dashboard for actual service charges.
 90 | 
 91 | ***Q: What is an ideal campaign utilization percentage? Is it okay if my campaign utilization is over 100%?***
 92 | 
 93 | ***A:*** The campaign utilization metric is a measure of your actual campaign usage compared against the `minProvisionedTPS` for the campaign. Any utilization value >= 100% is ideal since that means you are not over-provisioning, and therefore not over-paying, for campaign resources. You're letting Personalize handle the scaling in/out of the campaign. Anytime your utilization is below 100%, more resources are provisioned than are needed to satisfy the volume of requests at that time.
 94 | 
 95 | ***Q: How can I tell if Personalize is scaling out fast enough?***
 96 | 
 97 | ***A:*** Compare the "Actual vs Provisioned TPS" graph to the "Campaign Latency" graph on the Personalize Monitor CloudWatch dashboard. When your Actual TPS increases/spikes for a campaign, does the latency for the same campaign at the same time stay consistent? If so, this tells you that Personalize is maintaining response time as request volume increases and therefore scaling fast enough to meet demand. However, if latency increases significantly and to an unacceptable level for your application, this is an indication that Personalize may not be scaling fast enough. See the answer to the following question for some options.
 98 | 
 99 | ***Q: My workload is very spikey and Personalize is not scaling fast enough. What can I do?***
100 | 
101 | ***A:*** First, be sure to confirm that it is Personalize that is not scaling fast enough by reviewing the answer above. If the spikes are predictable or cyclical, you can pre-warm capacity in your campaign ahead of time by adjusting the `minProvisionedTPS` using the [UpdateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_UpdateCampaign.html) API and then dropping it back down after the traffic subsides. For example, increase capacity 30 minutes before a flash sale or marketing campaign is launched that brings a temporary surge in traffic. This can be done manually using the AWS console or automated by using [CloudWatch events](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/WhatIsCloudWatchEvents.html) based on a schedule or triggered based on an event in your application. The [personalize_update_campaign_tps](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_update_campaign_tps_function/) function that is deployed with this application can be used as the target for CloudWatch events or you can publish an `UpdatePersonalizeCampaignMinProvisionedTPS` event to EventBridge. If spikes in your workload are not predictable or known ahead of time, determining the optimal `minProvisionedTPS` to balance consistent latency vs cost is the best option. The metrics and dashboard graphs in this application can help you determine this value.
102 | 
103 | ***Q: After deploying this application in my AWS account, I created some new Personalize campaigns that I also want to monitor. How can I add them to be monitored and have them appear on my dashboard? Also, what about monitored campaigns that I delete?***
104 | 
105 | ***A:*** If you specified `all` for the `CampaignARNs` deployment parameter (see installation instructions above), any new campaigns you create will be automatically monitored and alarms created (if `AutoCreateCampaignUtilizationAlarms` was set to `Yes`) when the campaigns become active. Likewise, any campaigns that are deleted will no longer be monitored. If you want this application to monitor campaigns across multiple regions, be sure to specify the region names in the `Regions` deployment parameter. Note that this only applies when `CampaignARNs` is set to `all`. The CloudWatch dashboard will be automatically rebuilt every hour to add new campaigns and drop deleted campaigns. You can also trigger the dashboard to be rebuilt by publishing a `BuildPersonalizeMonitorDashboard` event to the default EventBridge event bus (see [dashboard_mgmt_function](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/dashboard_mgmt_function/)).
106 | 
107 | ## Reporting issues
108 | 
109 | If you encounter a bug, please create a new issue with as much detail as possible and steps for reproducing the bug. Similarly, if you have an idea for an improvement, please add an issue as well. Pull requests are also welcome! See the [Contributing Guidelines](https://github.com/aws-samples/amazon-personalize-monitor/blob/master/CONTRIBUTING.md) for more details.
110 | 
111 | ## License summary
112 | 
113 | This sample code is made available under a modified MIT license. See the LICENSE file.
114 | 


--------------------------------------------------------------------------------
/template.yaml:
--------------------------------------------------------------------------------
  1 | AWSTemplateFormatVersion: '2010-09-09'
  2 | Transform: AWS::Serverless-2016-10-31
  3 | Description: >
  4 |   Personalize monitoring tools including CloudWatch metrics, alarms, and dashboard; optional automated cost optimization
  5 | 
  6 | Metadata:
  7 |   AWS::ServerlessRepo::Application:
  8 |     Name: Amazon-Personalize-Monitor
  9 |     Description: >
 10 |       Creates a CloudWatch dashboard for monitoring the utilization of Amazon Personalize
 11 |       campaigns, creates CloudWatch alarms based on a user-defined threshold, and
 12 |       includes automated cost optimization actions.
 13 |     Author: AWS Applied AI - Personalize
 14 |     SpdxLicenseId: MIT-0
 15 |     LicenseUrl: LICENSE
 16 |     ReadmeUrl: README-SAR.md
 17 |     Labels: ['Personalize', 'CloudWatch', 'Monitoring']
 18 |     HomePageUrl: https://github.com/aws-samples/amazon-personalize-monitor
 19 |     SemanticVersion: 1.0.2
 20 |     SourceCodeUrl: https://github.com/aws-samples/amazon-personalize-monitor
 21 | 
 22 |   AWS::CloudFormation::Interface:
 23 |     ParameterGroups:
 24 |       - Label:
 25 |           default: "Amazon Personalize campaigns to monitor"
 26 |         Parameters:
 27 |           - CampaignARNs
 28 |           - Regions
 29 |       - Label:
 30 |           default: "CloudWatch alarm configuration"
 31 |         Parameters:
 32 |           - AutoCreateCampaignUtilizationAlarms
 33 |           - CampaignThresholdAlarmLowerBound
 34 |           - AutoCreateIdleCampaignAlarms
 35 |           - IdleCampaignThresholdHours
 36 |       - Label:
 37 |           default: "Cost optimization actions"
 38 |         Parameters:
 39 |           - AutoAdjustCampaignMinProvisionedTPS
 40 |           - AutoDeleteIdleCampaigns
 41 |       - Label:
 42 |           default: "Notifications"
 43 |         Parameters:
 44 |           - NotificationEndpoint
 45 |     ParameterLabels:
 46 |       CampaignARNs:
 47 |         default: "Personalize campaign ARNs to monitor"
 48 |       Regions:
 49 |         default: "AWS regions to monitor"
 50 |       AutoCreateCampaignUtilizationAlarms:
 51 |         default: "Automatically create campaign utilization CloudWatch alarms?"
 52 |       CampaignThresholdAlarmLowerBound:
 53 |         default: "Campaign utilization alarm lower bound threshold"
 54 |       AutoCreateIdleCampaignAlarms:
 55 |         default: "Automatically create idle campaign CloudWatch alarms?"
 56 |       IdleCampaignThresholdHours:
 57 |         default: "Number of hours without requests to be considered idle"
 58 |       AutoDeleteIdleCampaigns:
 59 |         default: "Automatically delete idle campaigns in idle alarm state?"
 60 |       AutoAdjustCampaignMinProvisionedTPS:
 61 |         default: "Automatically adjust/lower minProvisionedTPS for campaigns in utilization alarm state"
 62 |       NotificationEndpoint:
 63 |         default: "Email address to receive notifications"
 64 | 
 65 | Parameters:
 66 |   CampaignARNs:
 67 |     Type: String
 68 |     Description: >
 69 |       Comma separated list of Amazon Personalize Campaign ARNs to monitor or 'all' to dynamically monitor all campaigns.
 70 |     Default: 'all'
 71 | 
 72 |   Regions:
 73 |     Type: String
 74 |     Description: >
 75 |       Comma separated list of AWS region names. When using 'all' for CampaignARNs, this parameter can be used
 76 |       to control the region(s) where the Personalize Monitor looks for active Personalize Campaigns. When not specified,
 77 |       the region where you deploy this application will be used.
 78 | 
 79 |   AutoCreateCampaignUtilizationAlarms:
 80 |     Type: String
 81 |     Description: >
 82 |       Whether to automatically create CloudWatch alarms for campaign utilization for monitored campaigns. Valid values: Yes/No.
 83 |     AllowedValues:
 84 |       - 'Yes'
 85 |       - 'No'
 86 |     Default: 'Yes'
 87 | 
 88 |   CampaignThresholdAlarmLowerBound:
 89 |     Type: Number
 90 |     Description: >
 91 |       Campaign utilization alarm threshold value (in percent). When a monitored campaign's utilization falls below this value,
 92 |       the alarm state will be set to ALARM. Valid values: 0-1000 (integer).
 93 |     MinValue: 0
 94 |     MaxValue: 1000
 95 |     Default: 100
 96 | 
 97 |   AutoAdjustCampaignMinProvisionedTPS:
 98 |     Type: String
 99 |     Description: >
100 |       Whether to automatically adjust minProvisionedTPS down to lowest average TPS over rolling 24 hour window. The
101 |       minProvisionedTPS will never be increased. Valid values: Yes/No.
102 |     AllowedValues:
103 |       - 'Yes'
104 |       - 'No'
105 |     Default: 'Yes'
106 | 
107 |   AutoCreateIdleCampaignAlarms:
108 |     Type: String
109 |     Description: >
110 |       Whether to automatically create CloudWatch alarms for detecting idle campaigns. Valid values: Yes/No.
111 |     AllowedValues:
112 |       - 'Yes'
113 |       - 'No'
114 |     Default: 'Yes'
115 | 
116 |   IdleCampaignThresholdHours:
117 |     Type: Number
118 |     Description: >
119 |       Number of consecutive idle hours before a campaign is automatically deleted only if AutoDeleteIdleCampaigns is Yes. Valid values: 2-48 (integer).
120 |     MinValue: 2
121 |     MaxValue: 48
122 |     Default: 24
123 | 
124 |   AutoDeleteIdleCampaigns:
125 |     Type: String
126 |     Description: >
127 |       Whether to automatically delete campaigns that have been idle for IdleCampaignThresholdHours consecutive hours. Valid values: Yes/No.
128 |     AllowedValues:
129 |       - 'Yes'
130 |       - 'No'
131 |     Default: 'No'
132 | 
133 |   NotificationEndpoint:
134 |     Type: String
135 |     Description: >
136 |       Email address to receive CloudWatch alarm and other monitoring notifications.
137 | 
138 | Globals:
139 |   Function:
140 |     Timeout: 5
141 |     Runtime: python3.8
142 | 
143 | Resources:
144 |   NotificationsTopic:
145 |     Type: AWS::SNS::Topic
146 |     Properties:
147 |       DisplayName: 'Personalize Monitor Notifications'
148 |       Subscription:
149 |         - Endpoint: !Ref NotificationEndpoint
150 |           Protocol: email
151 |       Tags:
152 |         - Key: 'CreatedBy'
153 |           Value: 'PersonalizeMonitor'
154 |       TopicName: PersonalizeMonitorNotifications
155 | 
156 |   NotificationsTopicPolicy:
157 |     Type: AWS::SNS::TopicPolicy
158 |     Properties:
159 |       PolicyDocument:
160 |         Statement:
161 |           - Sid: PublishPolicy
162 |             Effect: Allow
163 |             Principal:
164 |               Service:
165 |                 - cloudwatch.amazonaws.com
166 |                 - events.amazonaws.com
167 |             Action: 'sns:Publish'
168 |             Resource: !Ref NotificationsTopic
169 |       Topics:
170 |         - !Ref NotificationsTopic
171 | 
172 |   NotificationsRule:
173 |     Type: AWS::Events::Rule
174 |     Properties:
175 |       Description: Routes Personalize Monitor notifications to notification SNS topic
176 |       EventPattern:
177 |         source:
178 |           - personalize.monitor
179 |         detail-type:
180 |           - PersonalizeCampaignMinProvisionedTPSUpdated
181 |           - PersonalizeCampaignDeleted
182 |       State: ENABLED
183 |       Targets:
184 |         - Arn: !Ref NotificationsTopic
185 |           Id: PersonalizeMonitorNotificationsId
186 |           InputTransformer:
187 |             InputPathsMap:
188 |               reason: "$.detail.Reason"
189 |               resources: "$.resources"
190 |               type: "$.detail-type"
191 |             InputTemplate: |
192 |               "Amazon Personalize monitor notification:"
193 |               ""
194 |               "Message type: <type>"
195 |               "Resource(s): <resources>"
196 |               "Reason: <reason>"
197 | 
198 |   CommonLayer:
199 |     Type: AWS::Serverless::LayerVersion
200 |     Properties:
201 |       ContentUri: src/layer
202 |       CompatibleRuntimes:
203 |         - python3.8
204 |     Metadata:
205 |       BuildMethod: python3.8
206 | 
207 |   MonitorFunction:
208 |     Type: AWS::Serverless::Function
209 |     Properties:
210 |       Description: Amazon Personalize monitor function that updates custom CloudWatch metrics and monitors campaign utilization every 5 minutes
211 |       Timeout: 30
212 |       CodeUri: src/personalize_monitor_function
213 |       Handler: personalize_monitor.lambda_handler
214 |       Layers:
215 |         - !Ref CommonLayer
216 |       Policies:
217 |         - Statement:
218 |           - Sid: PersonalizePolicy
219 |             Effect: Allow
220 |             Action:
221 |               - personalize:DescribeCampaign
222 |               - personalize:DescribeSolutionVersion
223 |               - personalize:ListCampaigns
224 |             Resource: '*'
225 |           - Sid: CloudWatchPolicy
226 |             Effect: Allow
227 |             Action:
228 |               - cloudwatch:DescribeAlarmsForMetric
229 |               - cloudwatch:DisableAlarmActions
230 |               - cloudwatch:EnableAlarmActions
231 |               - cloudwatch:GetMetricData
232 |               - cloudwatch:PutMetricAlarm
233 |               - cloudwatch:PutMetricData
234 |             Resource: '*'
235 |           - Sid: EventBridgePolicy
236 |             Effect: Allow
237 |             Action:
238 |               - events:PutEvents
239 |             Resource: '*'
240 |       Events:
241 |         ScheduledEvent:
242 |           Type: Schedule
243 |           Properties:
244 |             Description: Triggers primary Personalize Monitor monitoring logic
245 |             Schedule: cron(0/5 * * * ? *)
246 |             Enabled: True
247 |       Environment:
248 |         Variables:
249 |           CampaignARNs: !Ref CampaignARNs
250 |           Regions: !Ref Regions
251 |           NotificationsTopic: !Ref NotificationsTopic
252 |           AutoCreateCampaignUtilizationAlarms: !Ref AutoCreateCampaignUtilizationAlarms
253 |           CampaignThresholdAlarmLowerBound: !Ref CampaignThresholdAlarmLowerBound
254 |           AutoCreateIdleCampaignAlarms: !Ref AutoCreateIdleCampaignAlarms
255 |           IdleCampaignThresholdHours: !Ref IdleCampaignThresholdHours
256 |           AutoDeleteIdleCampaigns: !Ref AutoDeleteIdleCampaigns
257 |           AutoAdjustCampaignMinProvisionedTPS: !Ref AutoAdjustCampaignMinProvisionedTPS
258 | 
259 |   DashboardManagementFunction:
260 |     Type: AWS::Serverless::Function
261 |     Properties:
262 |       Description: Amazon Personalize monitor function that updates the CloudWatch dashboard hourly and when campaigns are added/deleted
263 |       Timeout: 15
264 |       CodeUri: src/dashboard_mgmt_function
265 |       Handler: dashboard_mgmt.lambda_handler
266 |       AutoPublishAlias: live
267 |       Layers:
268 |         - !Ref CommonLayer
269 |       Policies:
270 |         - Statement:
271 |           - Sid: PersonalizePolicy
272 |             Effect: Allow
273 |             Action:
274 |               - personalize:DescribeCampaign
275 |               - personalize:DescribeDatasetGroup
276 |               - personalize:DescribeSolutionVersion
277 |               - personalize:ListCampaigns
278 |             Resource: '*'
279 |           - Sid: DashboardPolicy
280 |             Effect: Allow
281 |             Action:
282 |               - cloudwatch:DeleteDashboards
283 |               - cloudwatch:PutDashboard
284 |             Resource: '*'
285 |       Environment:
286 |         Variables:
287 |           CampaignARNs: !Ref CampaignARNs
288 |           Regions: !Ref Regions
289 |       Events:
290 |         EBRule:
291 |           Type: EventBridgeRule
292 |           Properties:
293 |             Pattern:
294 |               source:
295 |                 - personalize.monitor
296 |               detail-type:
297 |                 - BuildPersonalizeMonitorDashboard
298 |         ScheduledEvent:
299 |           Type: Schedule
300 |           Properties:
301 |             Description: Hourly rebuild of Personalize Monitor CloudWatch dashboard
302 |             Schedule: cron(3 * * * ? *)
303 |             Enabled: True
304 | 
305 |   DeployDashboardCustomResource:
306 |     Type: Custom::DashboardCreate
307 |     Properties:
308 |       ServiceToken: !GetAtt DashboardManagementFunction.Arn
309 |       CampaignARNs: !Ref CampaignARNs
310 |       Regions: !Ref Regions
311 |       NotificationsTopic: !Ref NotificationsTopic
312 |       AutoCreateCampaignUtilizationAlarms: !Ref AutoCreateCampaignUtilizationAlarms
313 |       CampaignThresholdAlarmLowerBound: !Ref CampaignThresholdAlarmLowerBound
314 |       AutoCreateIdleCampaignAlarms: !Ref AutoCreateIdleCampaignAlarms
315 |       IdleCampaignThresholdHours: !Ref IdleCampaignThresholdHours
316 |       AutoDeleteIdleCampaigns: !Ref AutoDeleteIdleCampaigns
317 |       AutoAdjustCampaignMinProvisionedTPS: !Ref AutoAdjustCampaignMinProvisionedTPS
318 | 
319 |   UpdateCampaignTPSFunction:
320 |     Type: AWS::Serverless::Function
321 |     Properties:
322 |       Description: Amazon Personalize monitor function that updates the minProvisionedTPS for a campaign in response to an event
323 |       CodeUri: src/personalize_update_campaign_tps_function
324 |       Handler: personalize_update_campaign_tps.lambda_handler
325 |       Layers:
326 |         - !Ref CommonLayer
327 |       Policies:
328 |         - Statement:
329 |           - Sid: PersonalizePolicy
330 |             Effect: Allow
331 |             Action:
332 |               - personalize:UpdateCampaign
333 |             Resource: '*'
334 |           - Sid: EventBridgePolicy
335 |             Effect: Allow
336 |             Action:
337 |               - events:PutEvents
338 |             Resource: '*'
339 |       Events:
340 |         EBRule:
341 |           Type: EventBridgeRule
342 |           Properties:
343 |             Pattern:
344 |               source:
345 |                 - personalize.monitor
346 |               detail-type:
347 |                 - UpdatePersonalizeCampaignMinProvisionedTPS
348 | 
349 |   DeleteCampaignFunction:
350 |     Type: AWS::Serverless::Function
351 |     Properties:
352 |       Description: Amazon Personalize monitor function that deletes a campaign in response to an event
353 |       CodeUri: src/personalize_delete_campaign_function
354 |       Handler: personalize_delete_campaign.lambda_handler
355 |       Layers:
356 |         - !Ref CommonLayer
357 |       Policies:
358 |         - Statement:
359 |           - Sid: PersonalizePolicy
360 |             Effect: Allow
361 |             Action:
362 |               - personalize:DeleteCampaign
363 |             Resource: '*'
364 |           - Sid: EventBridgePolicy
365 |             Effect: Allow
366 |             Action:
367 |               - events:PutEvents
368 |             Resource: '*'
369 |           - Sid: CloudWatchFindAlarmsPolicy
370 |             Effect: Allow
371 |             Action:
372 |               - cloudwatch:DescribeAlarms
373 |               - cloudwatch:ListTagsForResource
374 |             Resource: '*'
375 |           - Sid: CloudWatchDeletePolicy
376 |             Effect: Allow
377 |             Action:
378 |               - cloudwatch:DeleteAlarms
379 |             Resource: !Sub 'arn:aws:cloudwatch:*:${AWS::AccountId}:alarm:PersonalizeMonitor-*'
380 |       Events:
381 |         EBCustomRule:
382 |           Type: EventBridgeRule
383 |           Properties:
384 |             Pattern:
385 |               source:
386 |                 - personalize.monitor
387 |               detail-type:
388 |                 - DeletePersonalizeCampaign
389 | 
390 |   CleanupFunction:
391 |     Type: AWS::Serverless::Function
392 |     Properties:
393 |       Description: Amazon Personalize monitor custom resource function that cleans up directly created resources when the application is deleted
394 |       Timeout: 15
395 |       CodeUri: src/cleanup_resources_function
396 |       Handler: cleanup_resources.lambda_handler
397 |       AutoPublishAlias: live
398 |       Layers:
399 |         - !Ref CommonLayer
400 |       Policies:
401 |         - Statement:
402 |           - Sid: PersonalizePolicy
403 |             Effect: Allow
404 |             Action:
405 |               - personalize:ListCampaigns
406 |             Resource: '*'
407 |           - Sid: CloudWatchFindAlarmsPolicy
408 |             Effect: Allow
409 |             Action:
410 |               - cloudwatch:DescribeAlarms
411 |               - cloudwatch:ListTagsForResource
412 |             Resource: '*'
413 |           - Sid: CloudWatchDeletePolicy
414 |             Effect: Allow
415 |             Action:
416 |               - cloudwatch:DeleteAlarms
417 |             Resource: !Sub 'arn:aws:cloudwatch:*:${AWS::AccountId}:alarm:PersonalizeMonitor-*'
418 |       Environment:
419 |         Variables:
420 |           CampaignARNs: !Ref CampaignARNs
421 |           Regions: !Ref Regions
422 | 
423 |   CleanupCustomResource:
424 |     Type: Custom::Cleanup
425 |     Properties:
426 |       ServiceToken: !GetAtt CleanupFunction.Arn
427 |       CampaignARNs: !Ref CampaignARNs
428 |       Regions: !Ref Regions
429 |       NotificationsTopic: !Ref NotificationsTopic
430 |       AutoCreateCampaignUtilizationAlarms: !Ref AutoCreateCampaignUtilizationAlarms
431 |       CampaignThresholdAlarmLowerBound: !Ref CampaignThresholdAlarmLowerBound
432 |       AutoCreateIdleCampaignAlarms: !Ref AutoCreateIdleCampaignAlarms
433 |       IdleCampaignThresholdHours: !Ref IdleCampaignThresholdHours
434 |       AutoDeleteIdleCampaigns: !Ref AutoDeleteIdleCampaigns
435 |       AutoAdjustCampaignMinProvisionedTPS: !Ref AutoAdjustCampaignMinProvisionedTPS
436 | 
437 | Outputs:
438 |   MonitorFunction:
439 |     Description: "Personalize monitor Function ARN"
440 |     Value: !GetAtt MonitorFunction.Arn
441 | 
442 |   DashboardManagementFunction:
443 |     Description: "CloudWatch Dashboard Management Function ARN"
444 |     Value: !GetAtt DashboardManagementFunction.Arn
445 | 
446 |   UpdateCampaignTPSFunction:
447 |     Description: "Update Personalize Campaign minProvisionedTPS Function ARN"
448 |     Value: !GetAtt UpdateCampaignTPSFunction.Arn
449 | 
450 |   DeleteCampaignFunction:
451 |     Description: "Delete Personalize Campaign Function ARN"
452 |     Value: !GetAtt DeleteCampaignFunction.Arn
453 | 
454 |   NotificationsTopic:
455 |     Description: "Notification SNS Topic ARN"
456 |     Value: !Ref NotificationsTopic
457 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Amazon Personalize Monitor
  2 | 
  3 | <!-- vscode-markdown-toc -->
  4 | * [Why is this important?](#Whyisthisimportant)
  5 | * [Features](#Features)
  6 | 	* [CloudWatch dashboard](#CloudWatchdashboard)
  7 | 	* [CloudWatch alarms](#CloudWatchalarms)
  8 | 	* [CloudWatch metrics](#CloudWatchmetrics)
  9 | 	* [Cost optimization (optional)](#Costoptimizationoptional)
 10 | 		* [Idle campaigns](#Idlecampaigns)
 11 | 		* [Over-provisioned campaigns](#Over-provisionedcampaigns)
 12 | * [Architecture](#Architecture)
 13 | * [Installing the application](#Installingtheapplication)
 14 | 	* [Option 1 - Install from Serverless Application Repository](#Option1-InstallfromServerlessApplicationRepository)
 15 | 	* [Option 2 - Install using Serverless Application Model](#Option2-InstallusingServerlessApplicationModel)
 16 | 	* [Application settings/parameters](#Applicationsettingsparameters)
 17 | * [Uninstalling the application](#Uninstallingtheapplication)
 18 | * [FAQs](#FAQs)
 19 | * [Reporting issues](#Reportingissues)
 20 | * [License summary](#Licensesummary)
 21 | 
 22 | <!-- vscode-markdown-toc-config
 23 | 	numbering=false
 24 | 	autoSave=true
 25 | 	/vscode-markdown-toc-config -->
 26 | <!-- /vscode-markdown-toc -->
 27 | 
 28 | This project contains the source code and supporting files for deploying a serverless application that adds monitoring, alerting, and optimization capabilities for [Amazon Personalize](https://aws.amazon.com/personalize/), an AI service from AWS that allows you to create custom ML recommenders based on your data. Highlights include:
 29 | 
 30 | - Generation of additional [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) to track the Average TPS, `minProvisionedTPS`, and Utilization of Personalize [campaigns](https://docs.aws.amazon.com/personalize/latest/dg/campaigns.html) over time.
 31 | - [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) to alert you via SNS/email when campaign utilization drops below a configurable threshold or has been idle for a configurable length of time (optional).
 32 | - [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) populated with graph widgets for Actual vs Provisioned TPS, Campaign Utilization, Campaign Latency, and the number of campaigns being monitored.
 33 | - Capable of monitoring campaigns across multiple regions in the same AWS account.
 34 | - Automatically delete campaigns that have been idle more than a configurable number of hours (optional).
 35 | - Automatically reduce the `minProvisionedTPS` for over-provisioned campaigns to optimize cost (optional).
 36 | 
 37 | ## <a name='Whyisthisimportant'></a>Why is this important?
 38 | 
 39 | Once you create a solution and solution version based on your data, an Amazon Personalize campaign can be created that allows you to retrieve recommendations in real-time based on the solution version. This is typically how Personalize is integrated into your applications. When an application needs to display personalized recommendations to a user, a [GetRecommendations](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#recommendations) or [GetPersonalizedRanking](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#rankings) API call is made to a campaign to retrieve recommendations. Just like monitoring your own application components is important, monitoring your Personalize campaigns is also important and considered a best practice. This application is designed to help you do just that.
 40 | 
 41 | When you provision a campaign using the [CreateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_CreateCampaign.html) API, you must specify a value for `minProvisionedTPS`. This value specifies the requested _minimum_ provisioned transactions (calls) per second that Amazon Personalize will support for that campaign. As your actual request volume to a campaign approaches its `minProvisionedTPS`, Personalize will automatically provision additional resources to meet your request volume. Then when request volume drops, Personalize will automatically scale back down **no lower** than `minProvisionedTPS`. **Since you are billed based on the higher of actual TPS and `minProvisionedTPS`, it is therefore important to not over-provision your campaigns to optimize cost.** This also means that leaving a campaign idle (active but no longer in-use) will result in unnecessary charges. This application gives you the tools to visualize your campaign utilization, to be notified when there is an opportunity to tune your campaign provisioning, and even take action to reduce and eliminate over-provisioning.
 42 | 
 43 | > General best practice is to set `minProvisionedTPS` to `1`, or your low watermark for campaign recommendations requests, and let Personalize auto-scale campaign resources to meet actual demand.
 44 | 
 45 | See the Amazon Personalize [pricing page](https://aws.amazon.com/personalize/pricing/) for full details on costs.
 46 | 
 47 | ## <a name='Features'></a>Features
 48 | 
 49 | ### <a name='CloudWatchdashboard'></a>CloudWatch dashboard
 50 | 
 51 | When you deploy this application, a [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) is built with widgets for Actual vs Provisioned TPS, Campaign Utilization, and Campaign Latency for the campaigns you wish to monitor. The dashboard gives you critical visual information to assess how your campaigns are performing and being utilized. The data in these graphs can help you properly tune your campaign's `minProvisionedTPS`.
 52 | 
 53 | ![Personalize Monitor CloudWatch Dashboard](./images/personalize-monitor-cloudwatch-dashboard.png)
 54 | 
 55 | For more details on the CloudWatch dashboard created and maintained by this application, see the [dashboard_mgmt](./src/dashboard_mgmt_function/) function page.
 56 | 
 57 | ### <a name='CloudWatchalarms'></a>CloudWatch alarms
 58 | 
 59 | At deployment time, you can optionally have this application automatically create [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) that will alert you when a monitored campaign's utilization drops below a threshold you define for nine out of twelve evaluation periods. Since the intervals are 5 minutes, that means that nine of the 5 minute evaluations over a 1 hour span must be below the threshold to enter an alarm status. The same rule applies to transition from alarm to OK status. Similarly, the idle campaign alarm will alert you when there has been no request activity for a campaign for a configurable amount of time. The alarms will be set up to alert you via email through an SNS topic. Once the alarms are set up, you can alternatively link them to any operations and messaging tools you already use (e.g. Slack, PagerDuty, etc.).
 60 | 
 61 | ![Personalize Monitor CloudWatch Alarms](./images/personalize-monitor-cloudwatch-alarms.png)
 62 | 
 63 | For more details on the CloudWatch alarms created by this application, see the [personalize_monitor](./src/personalize_monitor_function/) function page.
 64 | 
 65 | ### <a name='CloudWatchmetrics'></a>CloudWatch metrics
 66 | 
 67 | To support the CloudWatch dashboard and alarms described above, a few new custom [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) are added for the monitored campaigns. These metrics are populated by the [personalize_monitor](./src/personalize_monitor_function/) Lambda function that is set up to run every 5 minutes in your account. You can find these metrics in CloudWatch under Metrics in the "PersonalizeMonitor" namespace.
 68 | 
 69 | ![Personalize Monitor CloudWatch Metrics](./images/personalize-monitor-cloudwatch-metrics.png)
 70 | 
 71 | For more details on the custom metrics created by this application, see the [personalize_monitor](./src/personalize_monitor_function/) function page.
 72 | 
 73 | ### <a name='Costoptimizationoptional'></a>Cost optimization (optional)
 74 | 
 75 | This application can be optionally configured to automatically perform cost optimization actions for your Amazon Personalize campaigns.
 76 | 
 77 | #### <a name='Idlecampaigns'></a>Idle campaigns
 78 | Idle campaigns are those that have been provisioned but are not receiving any `GetRecommendations`/`GetPersonalizedRanking` calls. Since costs are incurred while a campaign is active regardless of whether it receives any requests, detecting and eliminating these idle campaigns can be an important cost optimization activity. This can be particularly useful in non-production AWS accounts such as development and testing where you are more likely to have abandoned campaigns. See the `AutoDeleteIdleCampaigns` and `IdleCampaignThresholdHours` deployment parameters in the installation instructions below and the [personalize_monitor](./src/personalize_monitor_function#automatically-deleting-idle-campaigns-optional) function for details.
 79 | 
 80 | #### <a name='Over-provisionedcampaigns'></a>Over-provisioned campaigns
 81 | 
 82 | Properly provisioning campaigns, as described earlier, is also an important cost optimization activity. This application can be configured to automatically reduce a campaign's `minProvisionedTPS` based on actual request volume. This will optimize a campaign's utilization when request volume is lower while relying on Personalize to auto-scale based on actual activity. See the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter below and the [personalize_monitor](./src/personalize_monitor_function#automatically-adjusting-campaign-minprovisionedtps-optional) function for details.
 83 | 
 84 | ## <a name='Architecture'></a>Architecture
 85 | 
 86 | The following diagram depicts how the Lambda functions in this application work together using an event-driven approach built on [Amazon EventBridge](https://docs.aws.amazon.com/eventbridge/latest/userguide/what-is-amazon-eventbridge.html). The [personalize_monitor](./src/personalize_monitor_function/) function is invoked every five minutes to generate CloudWatch metric data based on the monitored campaigns and create alarms (if configured). It also generates events which are published to EventBridge that trigger activities such as optimizing a campaign's `minProvisionedTPS`, deleting idle campaigns, updating the Personalize Monitor CloudWatch dashboard, and sending notifications. This approach allows you to more easily integrate these functions into your own operations by sending your own events, say, to trigger the dashboard to be rebuilt after you create a campaign or register your own targets to events generated by this application.
 87 | 
 88 | ![Personalize Monitor Architecture](./images/personalize-monitor-architecture.png)
 89 | 
 90 | See the readme pages for each function for details on the events that they produce and consume.
 91 | 
 92 | ## <a name='Installingtheapplication'></a>Installing the application
 93 | 
 94 | ***IMPORTANT NOTE:** Deploying this application in your AWS account will create and consume AWS resources, which will cost money. For example, the CloudWatch dashboard, the Lambda function that runs every 5 minutes to collect additional monitoring metrics, CloudWatch alarms, logging, and so on. Therefore, if after installing this application you choose not to use it as part of your monitoring strategy, be sure to follow the Uninstall instructions below to clean up all resources and avoid ongoing charges.*
 95 | 
 96 | ### <a name='Option1-InstallfromServerlessApplicationRepository'></a>Option 1 - Install from Serverless Application Repository
 97 | 
 98 | The easiest way to deploy this application is from the [Serverless Application Repository](https://aws.amazon.com/serverless/serverlessrepo/) (SAR).
 99 | 
100 | 1. Within the AWS account where you wish to deploy the application, browse to the [application's page](https://serverlessrepo.aws.amazon.com/applications/arn:aws:serverlessrepo:us-east-1:316031960777:applications~Amazon-Personalize-Monitor) in the Serverless Application Repository and click **"Deploy"**.
101 | 2. Enter/update values in the **"Application settings"** panel (described below) and click **"Deploy"** again.
102 | 
103 | ### <a name='Option2-InstallusingServerlessApplicationModel'></a>Option 2 - Install using Serverless Application Model
104 | 
105 | If you'd rather install the application manually, you can use the AWS [Serverless Application Model](https://aws.amazon.com/serverless/sam/) (SAM) CLI to build and deploy the application into your AWS account.
106 | 
107 | To use the SAM CLI, you need the following tools.
108 | 
109 | * SAM CLI - [Install the SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html)
110 | * [Python 3 installed](https://www.python.org/downloads/)
111 | * Docker - [Install Docker community edition](https://hub.docker.com/search/?type=edition&offering=community)
112 | 
113 | To build and deploy the application for the first time, run the following in your shell:
114 | 
115 | ```bash
116 | sam build --use-container --cached
117 | sam deploy --guided
118 | ```
119 | 
120 | The first command will build the source of the application. The second command will package and deploy the application to your AWS account with a series of prompts. The following section describes the supported application parameters.
121 | 
122 | ### <a name='Applicationsettingsparameters'></a>Application settings/parameters
123 | 
124 | Whether you install this application from SAR or SAM, the following parameters can be used to control how the application monitors your Personalize deployments.
125 | 
126 | | Parameter | Description | Default |
127 | | --- | --- | --- |
128 | | CampaignARNs | Comma separated list of Personalize campaign ARNs to monitor or `all` to monitor all active campaigns. It is recommended to use `all` so that any new campaigns that are added after deployment will be automatically detected, monitored, and have alarms created (optional) | `all` |
129 | | Regions | Comma separated list of AWS regions to monitor campaigns. Only applicable when `all` is used for `CampaignARNs`. Leaving this value blank will default to the region where this application is deployed (i.e. `AWS Region` parameter above). | |
130 | | AutoCreateCampaignUtilizationAlarms | Whether to automatically create a utilization CloudWatch alarm for each monitored campaign. | `Yes` |
131 | | CampaignThresholdAlarmLowerBound | Minimum threshold value (in percent) to enter alarm state for campaign utilization. This value is only relevant if `AutoCreateCampaignUtilizationAlarms` is `Yes`. | `100` |
132 | | AutoAdjustCampaignMinProvisionedTPS | Whether to automatically compare campaign request activity against the campaign's `minProvisionedTPS` to determine if `minProvisionedTPS` can be reduced to optimize utilization. | `Yes` |
133 | | AutoCreateIdleCampaignAlarms | Whether to automatically create an idle detection CloudWatch alarm for each monitored campaign. | `Yes` |
134 | | IdleCampaignThresholdHours | Number of hours that a campaign must be idle (i.e. no requests) before it is automatically deleted. `AutoDeleteIdleCampaigns` must be `Yes` for idle campaign deletion to occur. | `24` |
135 | | AutoDeleteIdleCampaigns | Whether to automatically delete idle campaigns. An idle campaign is one that has not had any requests in `IdleCampaignThresholdHours` hours. | `No` |
136 | | NotificationEndpoint | Email address to receive alarm and ok notifications and campaign delete and update events (optional). An [SNS](https://aws.amazon.com/sns/) topic is created and this email address will be added as a subscriber to that topic. You will receive a confirmation email for the SNS topic subscription so be sure to click the confirmation link in that email to ensure you receive notifications. | |
137 | 
138 | ## <a name='Uninstallingtheapplication'></a>Uninstalling the application
139 | 
140 | If you installed the application from the Serverless Application Repository, you can delete the application from the Lambda console in your AWS account (under Applications).
141 | 
142 | Alternatively, if you installed the application using SAM, you can delete the application using the AWS CLI. Assuming you used the default application name for the stack name (`personalize-monitor`), you can run the following:
143 | 
144 | ```bash
145 | aws cloudformation delete-stack --stack-name personalize-monitor
146 | ```
147 | 
148 | You can also delete the application stack in CloudFormation in the AWS console.
149 | 
150 | ## <a name='FAQs'></a>FAQs
151 | 
152 | ***Q: Can I use this application to determine my accumulated inference charges during the month?***
153 | 
154 | ***A:*** No! Although the `actualTPS` and `minProvisionedTPS` custom metrics generated by this application may be used to calculate an approximation of your accumulated inference charges, it should **never** be used as a substitute or proxy for actual Personalize inference costs. Always consult your AWS Billing Dashboard for actual service charges.
155 | 
156 | ***Q: What is an ideal campaign utilization percentage? Is it okay if my campaign utilization is over 100%?***
157 | 
158 | ***A:*** The campaign utilization metric is a measure of your actual campaign usage compared against the `minProvisionedTPS` for the campaign. Any utilization value >= 100% is ideal since that means you are not over-provisioning, and therefore not over-paying, for campaign resources. You're letting Personalize handle the scaling in/out of the campaign. Anytime your utilization is below 100%, more resources are provisioned than are needed to satisfy the volume of requests at that time.
159 | 
160 | ***Q: How can I tell if Personalize is scaling out fast enough?***
161 | 
162 | ***A:*** Compare the "Actual vs Provisioned TPS" graph to the "Campaign Latency" graph on the Personalize Monitor CloudWatch dashboard. When your Actual TPS increases/spikes for a campaign, does the latency for the same campaign at the same time stay consistent? If so, this tells you that Personalize is maintaining response time as request volume increases and therefore scaling fast enough to meet demand. However, if latency increases significantly and to an unacceptable level for your application, this is an indication that Personalize may not be scaling fast enough. See the answer to the following question for some options.
163 | 
164 | ***Q: My workload is very spikey and Personalize is not scaling fast enough. What can I do?***
165 | 
166 | ***A:*** First, be sure to confirm that it is Personalize that is not scaling fast enough by reviewing the answer above. If the spikes are predictable or cyclical, you can pre-warm capacity in your campaign ahead of time by adjusting the `minProvisionedTPS` using the [UpdateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_UpdateCampaign.html) API and then dropping it back down after the traffic subsides. For example, increase capacity 30 minutes before a flash sale or marketing campaign is launched that brings a temporary surge in traffic. This can be done manually using the AWS console or automated by using [CloudWatch events](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/WhatIsCloudWatchEvents.html) based on a schedule or triggered based on an event in your application. The [personalize_update_campaign_tps](./src/personalize_update_campaign_tps_function/) function that is deployed with this application can be used as the target for CloudWatch events or you can publish an `UpdatePersonalizeCampaignMinProvisionedTPS` event to EventBridge. If spikes in your workload are not predictable or known ahead of time, determining the optimal `minProvisionedTPS` to balance consistent latency vs cost is the best option. The metrics and dashboard graphs in this application can help you determine this value.
167 | 
168 | ***Q: After deploying this application in my AWS account, I created some new Personalize campaigns that I also want to monitor. How can I add them to be monitored and have them appear on my dashboard? Also, what about monitored campaigns that I delete?***
169 | 
170 | ***A:*** If you specified `all` for the `CampaignARNs` deployment parameter (see installation instructions above), any new campaigns you create will be automatically monitored and alarms created (if `AutoCreateCampaignUtilizationAlarms` and/or `AutoCreateIdleCampaignAlarms` were set to `Yes`) when the campaigns become active. Likewise, any campaigns that are deleted will no longer be monitored. If you want this application to monitor campaigns across multiple regions, be sure to specify the region names in the `Regions` deployment parameter. Note that this only applies when `CampaignARNs` is set to `all`. The CloudWatch dashboard will be automatically rebuilt every hour to add new campaigns and drop deleted campaigns. You can also trigger the dashboard to be rebuilt by publishing a `BuildPersonalizeMonitorDashboard` event to the default EventBridge event bus (see [dashboard_mgmt_function](./src/dashboard_mgmt_function/)).
171 | 
172 | If you want to change your deployment parameters that control what campaigns are monitored, redeploy the application using the installation option selected above.
173 | 
174 | **IMPORTANT: Redeploying this application will fully rebuild and replace your Personalize Monitor dashboard so any changes you made manually to the dashboard will be lost.**
175 | 
176 | ## <a name='Reportingissues'></a>Reporting issues
177 | 
178 | If you encounter a bug, please create a new issue with as much detail as possible and steps for reproducing the bug. Similarly, if you have an idea for an improvement, please add an issue as well. Pull requests are also welcome! See the [Contributing Guidelines](./CONTRIBUTING.md) for more details.
179 | 
180 | ## <a name='Licensesummary'></a>License summary
181 | 
182 | This sample code is made available under a modified MIT license. See the LICENSE file.
183 | 


--------------------------------------------------------------------------------
/src/personalize_monitor_function/personalize_monitor.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  2 | # SPDX-License-Identifier: MIT-0
  3 | 
  4 | """Lambda function that records Personalize resource metrics
  5 | 
  6 | Lambda function designed to be called every five minutes to record campaign TPS 
  7 | utilization metrics in CloudWatch. The metrics are used for alarms and on the 
  8 | CloudWatch dashboard created by this application.
  9 | """
 10 | 
 11 | import json
 12 | import boto3
 13 | import os
 14 | import datetime
 15 | import sys
 16 | import math
 17 | 
 18 | from botocore.exceptions import ClientError
 19 | from aws_lambda_powertools import Logger
 20 | 
 21 | from common import (
 22 |     PROJECT_NAME,
 23 |     ALARM_NAME_PREFIX,
 24 |     extract_region,
 25 |     get_client,
 26 |     determine_campaign_arns,
 27 |     get_configured_active_campaigns,
 28 |     put_event
 29 | )
 30 | 
 31 | logger = Logger()
 32 | 
 33 | MAX_METRICS_PER_CALL = 20
 34 | MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS = 1
 35 | 
 36 | ALARM_PERIOD_SECONDS = 300
 37 | ALARM_NAME_PREFIX_LOW_UTILIZATION = ALARM_NAME_PREFIX + 'LowCampaignUtilization-'
 38 | ALARM_NAME_PREFIX_IDLE = ALARM_NAME_PREFIX + 'IdleCampaign-'
 39 | 
 40 | def get_campaign_recipe_arn(campaign):
 41 |     recipe_arn = campaign.get('recipeArn')
 42 |     if not recipe_arn:
 43 |         campaign_region = extract_region(campaign['campaignArn'])
 44 |         personalize = get_client('personalize', campaign_region)
 45 | 
 46 |         response = personalize.describe_solution_version(solutionVersionArn = campaign['solutionVersionArn'])
 47 | 
 48 |         recipe_arn = response['solutionVersion']['recipeArn']
 49 |         campaign['recipeArn'] = recipe_arn
 50 | 
 51 |     return recipe_arn
 52 | 
 53 | def get_campaign_inference_metric_name(campaign):
 54 |     metric_name = 'GetRecommendations'
 55 |     if get_campaign_recipe_arn(campaign) == 'arn:aws:personalize:::recipe/aws-personalized-ranking':
 56 |         metric_name = 'GetPersonalizedRanking'
 57 | 
 58 |     return metric_name
 59 | 
 60 | def get_campaign_sum_requests_datapoints(campaign, start_time, end_time, period):
 61 |     campaign_region = extract_region(campaign['campaignArn'])
 62 |     cw = get_client(service_name = 'cloudwatch', region_name = campaign_region)
 63 | 
 64 |     metric_name = get_campaign_inference_metric_name(campaign)
 65 | 
 66 |     response = cw.get_metric_data(
 67 |         MetricDataQueries = [ 
 68 |             {
 69 |                 'Id': 'm1',
 70 |                 'MetricStat': {
 71 |                     'Metric': {
 72 |                         'Namespace': 'AWS/Personalize',
 73 |                         'MetricName': metric_name,
 74 |                         'Dimensions': [
 75 |                             {
 76 |                                 'Name': 'CampaignArn',
 77 |                                 'Value': campaign['campaignArn']
 78 |                             }
 79 |                         ]
 80 |                     },
 81 |                     'Period': period,
 82 |                     'Stat': 'Sum'
 83 |                 },
 84 |                 'ReturnData': True
 85 |             }
 86 |         ],
 87 |         StartTime = start_time,
 88 |         EndTime = end_time,
 89 |         ScanBy = 'TimestampDescending'
 90 |     )
 91 | 
 92 |     datapoints = []
 93 | 
 94 |     if response.get('MetricDataResults') and len(response['MetricDataResults']) > 0:
 95 |         results = response['MetricDataResults'][0]
 96 | 
 97 |         for idx, ts in enumerate(results['Timestamps']):
 98 |             datapoints.append({
 99 |                 'Timestamp': ts,
100 |                 'Value': results['Values'][idx]
101 |             })
102 | 
103 |     return datapoints
104 | 
def get_campaign_sum_requests_by_hour(campaign, start_time, end_time):
    ''' Returns the campaign's request-count datapoints for the time range using a one hour (3600 second) period. '''
    return get_campaign_sum_requests_datapoints(campaign, start_time, end_time, 3600)
108 | 
def get_campaign_total_requests(campaign, start_time, end_time, period):
    ''' Returns the total number of requests made to the campaign over the time range.

    The total is the sum of the 'Value' of every datapoint retrieved for the given period;
    0 is returned when there are no datapoints.
    '''
    datapoints = get_campaign_sum_requests_datapoints(campaign, start_time, end_time, period)

    if not datapoints:
        return 0

    return sum(datapoint['Value'] for datapoint in datapoints)
118 | 
def get_campaign_average_tps(campaign, start_time, end_time, period = ALARM_PERIOD_SECONDS):
    ''' Returns the campaign's average requests per second: total requests in the time range divided by period.

    NOTE(review): the divisor is the period (seconds), not the length of the time range, so the
    result is only a true average when the range spans exactly one period - confirm callers do so.
    '''
    total_requests = get_campaign_total_requests(campaign, start_time, end_time, period)
    average_tps = total_requests / period
    return average_tps
122 | 
def get_campaign_age_hours(campaign):
    ''' Returns the number of whole hours (rounded down) between the campaign's 'creationDateTime' and the current UTC time. '''
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    campaign_age = utc_now - campaign['creationDateTime']

    # Floor division of two timedeltas yields an int count of complete hours.
    return campaign_age // datetime.timedelta(hours = 1)
129 | 
def get_campaign_last_update_age_hours(campaign):
    ''' Returns the number of whole hours (rounded down) since the campaign's 'lastUpdatedDateTime'.

    Returns None when the campaign has no (or an empty) 'lastUpdatedDateTime'.
    '''
    last_updated = campaign.get('lastUpdatedDateTime')
    if not last_updated:
        return None

    utc_now = datetime.datetime.now(datetime.timezone.utc)
    time_since_update = utc_now - campaign['lastUpdatedDateTime']

    # Floor division of two timedeltas yields an int count of complete hours.
    return time_since_update // datetime.timedelta(hours = 1)
138 | 
def is_campaign_updatable(campaign):
    ''' Returns True when the campaign's status allows it to be changed.

    The campaign's own status must be 'ACTIVE' or 'CREATE FAILED' and, when the campaign
    carries a 'latestCampaignUpdate', that update's status must be one of those values too.
    '''
    updatable_statuses = ( 'ACTIVE', 'CREATE FAILED' )

    if campaign['status'] not in updatable_statuses:
        return False

    latest_update = campaign.get('latestCampaignUpdate')
    if not latest_update:
        return True

    return latest_update['status'] in updatable_statuses
148 | 
def put_metrics(client, metric_datas):
    ''' Writes the given metric data to the PROJECT_NAME namespace using the supplied CloudWatch client. '''
    client.put_metric_data(Namespace = PROJECT_NAME, MetricData = metric_datas)
    logger.debug('Put data for %d metrics', len(metric_datas))
157 | 
def append_metric(metric_datas_by_region, region, metric):
    ''' Adds metric to the list of metrics kept for region in metric_datas_by_region.

    When the region has no list yet (or only an empty one), a new list holding the metric
    is stored under the region.
    '''
    existing_metrics = metric_datas_by_region.get(region)

    if existing_metrics:
        existing_metrics.append(metric)
        return

    metric_datas_by_region[region] = [ metric ]
166 | 
def create_utilization_alarm(campaign_region, campaign, utilization_threshold_lower_bound):
    ''' Ensures a low utilization alarm exists on the campaign's 'campaignUtilization' metric.

    An existing alarm is recognized by its name prefix (ALARM_NAME_PREFIX_LOW_UTILIZATION) and a
    "less than" style comparison operator. When none is found a new alarm is created that
    notifies the topic named by the 'NotificationsTopic' environment variable. When one is
    found, only its actions are enabled/disabled to match the campaign's current minProvisionedTPS.

    Returns True when a new alarm was created; otherwise False.
    '''
    cw = get_client(service_name = 'cloudwatch', region_name = campaign_region)

    campaign_dimension = {
        'Name': 'CampaignArn',
        'Value': campaign['campaignArn']
    }

    response = cw.describe_alarms_for_metric(
        MetricName = 'campaignUtilization',
        Namespace = PROJECT_NAME,
        Dimensions = [ campaign_dimension ]
    )

    new_alarm_name = ALARM_NAME_PREFIX_LOW_UTILIZATION + campaign['name']

    # Alarm actions are only worthwhile when minProvisionedTPS > 1: at 1 there is
    # no lower value to drop to, so utilization can't be improved. Abandoned
    # campaigns are covered by the idle campaign alarm instead.
    enable_actions = campaign['minProvisionedTPS'] > 1

    lower_bound_operators = [ 'LessThanThreshold', 'LessThanOrEqualToThreshold' ]
    existing_alarm = next(
        (
            candidate for candidate in response['MetricAlarms']
            if candidate['AlarmName'].startswith(ALARM_NAME_PREFIX_LOW_UTILIZATION)
                and candidate['ComparisonOperator'] in lower_bound_operators
        ),
        None
    )

    if existing_alarm is None:
        logger.info('Creating lower bound utilization alarm for %s', campaign['campaignArn'])

        topic_arn = os.environ['NotificationsTopic']

        cw.put_metric_alarm(
            AlarmName = new_alarm_name,
            AlarmDescription = 'Alarms when campaign utilization falls below threashold indicating possible over provisioning condition',
            ActionsEnabled = enable_actions,
            OKActions = [ topic_arn ],
            AlarmActions = [ topic_arn ],
            MetricName = 'campaignUtilization',
            Namespace = PROJECT_NAME,
            Statistic = 'Average',
            Dimensions = [ campaign_dimension ],
            Period = ALARM_PERIOD_SECONDS,
            EvaluationPeriods = 12, # last 60 minutes
            DatapointsToAlarm = 9,  # alarm state for 45 of last 60 minutes
            Threshold = utilization_threshold_lower_bound,
            ComparisonOperator = 'LessThanThreshold',
            TreatMissingData = 'missing',
            Tags = [
                {
                    'Key': 'CreatedBy',
                    'Value': PROJECT_NAME
                }
            ]
        )

        return True

    if enable_actions != existing_alarm['ActionsEnabled']:
        # Bring the existing alarm's actions in line with the desired state.
        toggle_actions = cw.enable_alarm_actions if enable_actions else cw.disable_alarm_actions
        toggle_actions(AlarmNames = [ existing_alarm['AlarmName'] ])

    return False
243 | 
def create_idle_campaign_alarm(campaign_region, campaign, idle_campaign_threshold_hours):
    ''' Ensures an idle campaign alarm exists on the campaign's AWS/Personalize inference metric.

    An existing alarm is recognized by its name prefix (ALARM_NAME_PREFIX_IDLE), a
    'LessThanOrEqualToThreshold' comparison and a threshold of 0. When none is found a new
    alarm is created that notifies the topic named by the 'NotificationsTopic' environment
    variable. When one is found, only its actions are enabled/disabled depending on whether
    the campaign is at least idle_campaign_threshold_hours old.

    Returns True when a new alarm was created; otherwise False.
    '''
    cw = get_client(service_name = 'cloudwatch', region_name = campaign_region)
    topic_arn = os.environ['NotificationsTopic']

    metric_name = get_campaign_inference_metric_name(campaign)

    campaign_dimension = {
        'Name': 'CampaignArn',
        'Value': campaign['campaignArn']
    }

    response = cw.describe_alarms_for_metric(
        MetricName = metric_name,
        Namespace = 'AWS/Personalize',
        Dimensions = [ campaign_dimension ]
    )

    new_alarm_name = ALARM_NAME_PREFIX_IDLE + campaign['name']

    # The alarm treats missing data as breaching, so a brand new campaign would
    # alarm right away. Hold back the actions until the campaign has existed for
    # at least the idle threshold.
    enable_actions = get_campaign_age_hours(campaign) >= idle_campaign_threshold_hours

    existing_alarm = next(
        (
            candidate for candidate in response['MetricAlarms']
            if candidate['AlarmName'].startswith(ALARM_NAME_PREFIX_IDLE)
                and candidate['ComparisonOperator'] == 'LessThanOrEqualToThreshold'
                and int(candidate['Threshold']) == 0
        ),
        None
    )

    if existing_alarm is None:
        logger.info('Creating idle utilization alarm for %s', campaign['campaignArn'])

        # Number of alarm periods needed to cover the idle threshold.
        periods_per_hour = (60 * 60) / ALARM_PERIOD_SECONDS

        cw.put_metric_alarm(
            AlarmName = new_alarm_name,
            AlarmDescription = 'Alarms when campaign utilization is idle for continguous length of time indicating potential abandoned campaign',
            ActionsEnabled = enable_actions,
            OKActions = [ topic_arn ],
            AlarmActions = [ topic_arn ],
            MetricName = metric_name,
            Namespace = 'AWS/Personalize',
            Statistic = 'Sum',
            Dimensions = [ campaign_dimension ],
            Period = ALARM_PERIOD_SECONDS,
            EvaluationPeriods = int(periods_per_hour * idle_campaign_threshold_hours),
            Threshold = 0,
            ComparisonOperator = 'LessThanOrEqualToThreshold',
            TreatMissingData = 'breaching', # Won't get metric data for idle campaigns
            Tags = [
                {
                    'Key': 'CreatedBy',
                    'Value': PROJECT_NAME
                }
            ]
        )

        return True

    if enable_actions != existing_alarm['ActionsEnabled']:
        # Bring the existing alarm's actions in line with the desired state.
        toggle_actions = cw.enable_alarm_actions if enable_actions else cw.disable_alarm_actions
        toggle_actions(AlarmNames = [ existing_alarm['AlarmName'] ])

    return False
321 | 
def divide_chunks(l, n):
    ''' Lazily yields consecutive slices of l that are each at most n items long; the final slice may be shorter. '''
    chunk_starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in chunk_starts)
325 | 
def perform_hourly_checks(campaign_arn):
    ''' Decides whether the hourly checks for campaign_arn are due on this run.

    The ARN is hashed (sum of its UTF-8 bytes) onto one of six 10 minute slots of the hour so
    hourly checks are spread out across campaigns. Returns True only during the first two
    minutes of the campaign's slot, based on the current local clock minute.
    '''
    slot_length_minutes = 10
    slots_per_hour = 6  # 60 mins / 10

    slot_index = sum(campaign_arn.encode('utf-8')) % slots_per_hour
    slot_start_minute = slot_index * slot_length_minutes

    # Accept the first two minutes of the slot to tolerate CW event lag
    # (assumes current schedule of every 5 mins).
    current_minute = datetime.datetime.now().minute
    return slot_start_minute <= current_minute < slot_start_minute + 2
332 | 
def _get_event_override(event, name):
    ''' Returns the value of setting name from the event, or None when the event does not set it (missing, None or empty string). '''
    value = event.get(name)
    if value is None or value == '':
        return None
    return value

def _resolve_flag_setting(event, name, default):
    ''' Resolves a boolean setting from the event, falling back to the environment variable of the same name and then default.

    String values (from the event or the environment) are enabled when they are one of
    'true', 'yes' or '1' (case-insensitive); any other type is converted with bool().
    '''
    value = _get_event_override(event, name)
    if value is None:
        value = os.environ.get(name, default)

    if isinstance(value, str):
        return value.lower() in [ 'true', 'yes', '1' ]

    return bool(value)

def _resolve_numeric_setting(event, name, default, convert):
    ''' Resolves a numeric setting from the event, falling back to the environment variable of the same name and then default; the result is passed through convert (e.g. int or float). '''
    value = _get_event_override(event, name)
    if value is None:
        value = os.environ.get(name, default)

    return convert(value)

@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context):
    ''' Lambda entry point that publishes utilization metrics for the configured campaigns and manages their alarms.

    For every campaign returned by get_configured_active_campaigns(event) this:
      - records minProvisionedTPS, averageTPS (when non-zero) and campaignUtilization metrics
        for the most recent complete 5 minute window;
      - once per hour per campaign (see perform_hourly_checks), optionally fires a
        'DeletePersonalizeCampaign' event for idle campaigns and/or an
        'UpdatePersonalizeCampaignMinProvisionedTPS' event to step minProvisionedTPS down;
      - optionally creates/maintains the low utilization and idle campaign alarms.
    When at least one alarm was created, a 'BuildPersonalizeMonitorDashboard' event is fired.

    Settings are read from the event and fall back to the environment variable of the same
    name: AutoCreateCampaignUtilizationAlarms, CampaignThresholdAlarmLowerBound,
    AutoCreateIdleCampaignAlarms, AutoDeleteIdleCampaigns, IdleCampaignThresholdHours and
    AutoAdjustCampaignMinProvisionedTPS. A value explicitly present in the event (including
    false or 0) always takes precedence over the environment.

    Returns a summary string of the metrics written and alarms created.

    Raises ValueError when IdleCampaignThresholdHours is below MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS.
    '''
    # Resolve settings. An explicit falsy value in the event (e.g. false) must win over
    # the environment default; a plain truthiness check would silently discard it.
    auto_create_utilization_alarms = _resolve_flag_setting(event, 'AutoCreateCampaignUtilizationAlarms', 'yes')
    utilization_threshold_lower_bound = _resolve_numeric_setting(event, 'CampaignThresholdAlarmLowerBound', '100.0', float)
    auto_create_idle_alarms = _resolve_flag_setting(event, 'AutoCreateIdleCampaignAlarms', 'yes')
    auto_delete_idle_campaigns = _resolve_flag_setting(event, 'AutoDeleteIdleCampaigns', 'false')
    idle_campaign_threshold_hours = _resolve_numeric_setting(event, 'IdleCampaignThresholdHours', '24', int)

    if idle_campaign_threshold_hours < MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS:
        raise ValueError(f'"IdleCampaignThresholdHours" must be >= {MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS} hours')

    auto_adjust_campaign_tps = _resolve_flag_setting(event, 'AutoAdjustCampaignMinProvisionedTPS', 'yes')

    campaigns = get_configured_active_campaigns(event)

    logger.info('Retrieving minProvisionedTPS for %d active campaigns', len(campaigns))

    current_region = os.environ['AWS_REGION']

    # Metrics are collected per region so each batch is written to the campaign's own region.
    metric_datas_by_region = {}

    append_metric(metric_datas_by_region, current_region, {
        'MetricName': 'monitoredCampaignCount',
        'Value': len(campaigns),
        'Unit': 'Count'
    })

    campaign_metrics_written = 0
    all_metrics_written = 0
    alarms_created = 0

    # Define our 5 minute window, ensuring it's on prior 5 minute boundary.
    end_time = datetime.datetime.now(datetime.timezone.utc)
    end_time = end_time.replace(microsecond=0,second=0, minute=end_time.minute - end_time.minute % 5)
    start_time = end_time - datetime.timedelta(minutes=5)

    for campaign in campaigns:
        campaign_arn = campaign['campaignArn']
        campaign_region = extract_region(campaign_arn)

        min_provisioned_tps = campaign['minProvisionedTPS']

        append_metric(metric_datas_by_region, campaign_region, {
            'MetricName': 'minProvisionedTPS',
            'Dimensions': [
                {
                    'Name': 'CampaignArn',
                    'Value': campaign_arn
                }
            ],
            'Value': min_provisioned_tps,
            'Unit': 'Count/Second'
        })

        tps = get_campaign_average_tps(campaign, start_time, end_time)
        utilization = 0

        # averageTPS is only published when there was traffic; utilization is always published.
        if tps:
            append_metric(metric_datas_by_region, campaign_region, {
                'MetricName': 'averageTPS',
                'Dimensions': [
                    {
                        'Name': 'CampaignArn',
                        'Value': campaign_arn
                    }
                ],
                'Value': tps,
                'Unit': 'Count/Second'
            })

            utilization = tps / min_provisioned_tps * 100

        append_metric(metric_datas_by_region, campaign_region, {
            'MetricName': 'campaignUtilization',
            'Dimensions': [
                {
                    'Name': 'CampaignArn',
                    'Value': campaign_arn
                }
            ],
            'Value': utilization,
            'Unit': 'Percent'
        })

        logger.debug(
            'Campaign %s has current minProvisionedTPS of %d and actual TPS of %s yielding %.2f%% utilization', 
            campaign_arn, min_provisioned_tps, tps, utilization
        )
        campaign_metrics_written += 1

        # Only do idle campaign and minProvisionedTPS adjustment checks once per hour for each campaign.
        perform_hourly_checks_this_run = perform_hourly_checks(campaign_arn)

        # Determine how old the campaign is and time since last update.
        campaign_age_hours = get_campaign_age_hours(campaign)
        campaign_update_age_hours = get_campaign_last_update_age_hours(campaign)

        campaign_delete_event_fired = False

        if utilization == 0 and perform_hourly_checks_this_run and auto_delete_idle_campaigns:
            # Campaign is currently idle. Let's see if it's old enough and not being updated recently.
            logger.info(
                'Performing idle delete check for campaign %s; campaign is %d hours old; last updated %s hours ago', 
                campaign_arn, campaign_age_hours, campaign_update_age_hours
            )

            if (campaign_age_hours >= idle_campaign_threshold_hours):

                # Campaign has been around long enough. Let's see how long it's been idle.
                end_time_idle_check = datetime.datetime.now(datetime.timezone.utc)
                start_time_idle_check = end_time_idle_check - datetime.timedelta(hours = idle_campaign_threshold_hours)
                period_idle_check = idle_campaign_threshold_hours * 60 * 60

                total_requests = get_campaign_total_requests(campaign, start_time_idle_check, end_time_idle_check, period_idle_check)

                if total_requests == 0:
                    if is_campaign_updatable(campaign):
                        reason = f'Campaign {campaign_arn} has been idle for at least {idle_campaign_threshold_hours} hours so initiating delete according to configuration.'

                        logger.info(reason)

                        put_event(
                            detail_type = 'DeletePersonalizeCampaign',
                            detail = json.dumps({
                                'CampaignARN': campaign_arn,
                                'CampaignUtilization': utilization,
                                'CampaignAgeHours': campaign_age_hours,
                                'IdleCampaignThresholdHours': idle_campaign_threshold_hours,
                                'TotalRequestsDuringIdleThresholdHours': total_requests,
                                'Reason': reason
                            }),
                            resources = [ campaign_arn ]
                        )

                        campaign_delete_event_fired = True
                    else:
                        logger.warning(
                            'Campaign %s has been idle for at least %d hours but its status will not allow it to be deleted on this run', 
                            campaign_arn, idle_campaign_threshold_hours
                        )
                else:
                    logger.warning(
                        'Campaign %s is currently idle but has had %d requests within the last %d hours so does not meet idle criteria for auto-deletion', 
                        campaign_arn, total_requests, idle_campaign_threshold_hours
                    )
            else:
                logger.info(
                    'Campaign %s is only %d hours old and last update %s hours old; too new to consider for auto-deletion', 
                    campaign_arn, campaign_age_hours, campaign_update_age_hours
                )

        if (not campaign_delete_event_fired and 
                perform_hourly_checks_this_run and 
                auto_adjust_campaign_tps and 
                min_provisioned_tps > 1):

            # Look at hourly request totals over the trailing window, aligned to the top of the hour.
            days_back = 14
            end_time_tps_check = datetime.datetime.now(datetime.timezone.utc).replace(minute=0, second=0, microsecond=0)
            start_time_tps_check = end_time_tps_check - datetime.timedelta(days = days_back)

            datapoints = get_campaign_sum_requests_by_hour(campaign, start_time_tps_check, end_time_tps_check)
            min_reqs = sys.maxsize
            max_reqs = total_reqs = total_avg_tps = min_avg_tps = max_avg_tps = 0

            for datapoint in datapoints:
                total_reqs += datapoint['Value']
                min_reqs = min(min_reqs, datapoint['Value'])
                max_reqs = max(max_reqs, datapoint['Value'])

            # Convert hourly request counts to (whole) requests per second.
            if len(datapoints) > 0:
                total_avg_tps = int(total_reqs / (len(datapoints) * 3600))
                min_avg_tps = int(min_reqs / 3600)
                max_avg_tps = int(max_reqs / 3600)

            logger.info(
                'Performing minProvisionedTPS adjustment check for campaign %s; min/max/avg hourly TPS over last %d days for %d datapoints: %d/%d/%.2f', 
                campaign_arn, days_back, len(datapoints), min_avg_tps, max_avg_tps, total_avg_tps
            )

            min_age_to_update_hours = 24

            age_eligible = True

            if campaign_age_hours < min_age_to_update_hours:
                logger.info(
                    'Campaign %s is less than %d hours old so not eligible for minProvisionedTPS adjustment yet', 
                    campaign_arn, min_age_to_update_hours
                )
                age_eligible = False

            if age_eligible and min_avg_tps < min_provisioned_tps:
                # Incrementally drop minProvisionedTPS.
                new_min_tps = max(1, int(math.floor(min_provisioned_tps * .75)))

                if is_campaign_updatable(campaign):
                    reason = f'Step down adjustment of minProvisionedTPS for campaign {campaign_arn} down from {min_provisioned_tps} to {new_min_tps} based on average hourly TPS low watermark of {min_avg_tps} over last {days_back} days'
                    logger.info(reason)

                    # default = str is needed since the datapoints carry datetime timestamps.
                    put_event(
                        detail_type = 'UpdatePersonalizeCampaignMinProvisionedTPS',
                        detail = json.dumps({
                            'CampaignARN': campaign_arn,
                            'CampaignUtilization': utilization,
                            'CampaignAgeHours': campaign_age_hours,
                            'CurrentProvisionedTPS': min_provisioned_tps,
                            'MinProvisionedTPS': new_min_tps,
                            'MinAverageTPS': min_avg_tps,
                            'MaxAverageTPS': max_avg_tps,
                            'Datapoints': datapoints,
                            'Reason': reason
                        }, default = str),
                        resources = [ campaign_arn ]
                    )
                else:
                    logger.warning(
                        'Campaign %s could have its minProvisionedTPS adjusted down from %d to %d based on average hourly TPS low watermark over last %d days but its status will not allow it to be updated on this run', 
                        campaign_arn, min_provisioned_tps, new_min_tps, days_back
                    )

        # No point in creating alarms for a campaign that is about to be deleted.
        if not campaign_delete_event_fired:
            if auto_create_utilization_alarms:
                if create_utilization_alarm(campaign_region, campaign, utilization_threshold_lower_bound):
                    alarms_created += 1

            if auto_create_idle_alarms:
                if create_idle_campaign_alarm(campaign_region, campaign, idle_campaign_threshold_hours):
                    alarms_created += 1

    # Write the collected metrics to each region in chunks of at most MAX_METRICS_PER_CALL.
    for region, metric_datas in metric_datas_by_region.items():
        cw = get_client(service_name = 'cloudwatch', region_name = region)

        metric_datas_chunks = divide_chunks(metric_datas, MAX_METRICS_PER_CALL)

        for metrics_datas_chunk in metric_datas_chunks:
            put_metrics(cw, metrics_datas_chunk)
            all_metrics_written += len(metrics_datas_chunk)

    outcome = f'Logged {all_metrics_written} TPS utilization metrics for {campaign_metrics_written} active campaigns; {alarms_created} alarms created'
    logger.info(outcome)

    if alarms_created > 0:
        # At least one new alarm was created so that likely means new campaigns were created too. Let's trigger the dashboard to be rebuilt.
        logger.info('Triggering rebuild of the CloudWatch dashboard since %d new alarm(s) were created', alarms_created)
        put_event(
            detail_type = 'BuildPersonalizeMonitorDashboard',
            detail = json.dumps({
                'Reason': f'Triggered rebuild due to {alarms_created} new alarm(s) being created'
            })
        )

    return outcome
600 | 


--------------------------------------------------------------------------------