├── src ├── layer │ ├── __init__.py │ ├── requirements.txt │ ├── README.md │ └── common.py ├── cleanup_resources_function │ ├── __init__.py │ ├── requirements.txt │ ├── README.md │ └── cleanup_resources.py ├── dashboard_mgmt_function │ ├── __init__.py │ ├── requirements.txt │ ├── dashboard-template.mustache │ ├── README.md │ └── dashboard_mgmt.py ├── personalize_monitor_function │ ├── __init__.py │ ├── requirements.txt │ ├── README.md │ └── personalize_monitor.py ├── personalize_delete_campaign_function │ ├── __init__.py │ ├── requirements.txt │ ├── personalize_delete_campaign.py │ └── README.md └── personalize_update_campaign_tps_function │ ├── __init__.py │ ├── requirements.txt │ ├── personalize_update_campaign_tps.py │ └── README.md ├── .gitignore ├── images ├── personalize-monitor-architecture.png ├── personalize-monitor-cloudwatch-alarms.png ├── personalize-monitor-cloudwatch-dashboard.png └── personalize-monitor-cloudwatch-metrics.png ├── .github └── PULL_REQUEST_TEMPLATE.md ├── CODE_OF_CONDUCT.md ├── samconfig.toml ├── sar-publish.sh ├── LICENSE ├── CONTRIBUTING.md ├── README-SAR.md ├── template.yaml └── README.md /src/layer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/cleanup_resources_function/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/dashboard_mgmt_function/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/personalize_monitor_function/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/src/personalize_delete_campaign_function/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/personalize_update_campaign_tps_function/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | .vscode 4 | .aws-sam 5 | env -------------------------------------------------------------------------------- /src/cleanup_resources_function/requirements.txt: -------------------------------------------------------------------------------- 1 | # Dependencies bundled in layer -------------------------------------------------------------------------------- /src/personalize_monitor_function/requirements.txt: -------------------------------------------------------------------------------- 1 | # Dependencies bundled in layer -------------------------------------------------------------------------------- /src/personalize_delete_campaign_function/requirements.txt: -------------------------------------------------------------------------------- 1 | # Dependencies bundled in layer -------------------------------------------------------------------------------- /src/personalize_update_campaign_tps_function/requirements.txt: -------------------------------------------------------------------------------- 1 | # Dependencies bundled in layer -------------------------------------------------------------------------------- /src/dashboard_mgmt_function/requirements.txt: -------------------------------------------------------------------------------- 1 | # Other dependencies bundled in layer 2 | chevron==0.13.1 -------------------------------------------------------------------------------- 
/src/layer/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-lambda-powertools==1.6.1 2 | crhelper==2.0.6 3 | expiring-dict==1.1.0 -------------------------------------------------------------------------------- /images/personalize-monitor-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-architecture.png -------------------------------------------------------------------------------- /images/personalize-monitor-cloudwatch-alarms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-alarms.png -------------------------------------------------------------------------------- /images/personalize-monitor-cloudwatch-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-dashboard.png -------------------------------------------------------------------------------- /images/personalize-monitor-cloudwatch-metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-metrics.png -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute 
this contribution, under the terms of your choice. 7 | -------------------------------------------------------------------------------- /src/cleanup_resources_function/README.md: -------------------------------------------------------------------------------- 1 | # Amazon Personalize Monitor - Cleanup Function 2 | 3 | This Lambda function is called as a CloudFormation custom resource when the application is deleted/uninstalled so that resources created dynamically by the application, such as CloudWatch alarms, are also deleted. -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
#!/bin/bash

# Utility script to deploy application to the Serverless Application Repository.
#
# Usage: sar-publish.sh BUCKET REGION
#   BUCKET  S3 bucket that receives the packaged resources for SAR.
#   REGION  AWS region where the application is published.

# Abort on the first failing command so a failed build is never packaged/published.
set -e

# Bucket must have policy to allow SAR access.
# See https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-template-publishing-applications.html
BUCKET=$1
REGION=$2

if [ -z "$BUCKET" ] || [ -z "$REGION" ]; then
    echo "Usage: $0 BUCKET REGION"
    echo " where BUCKET is the S3 bucket to deploy packaged resources for SAR and REGION is the AWS region where to publish the application"
    exit 1
fi

echo "Building application"
sam build --use-container --cached

cd .aws-sam/build
echo "Packaging application"
# Quote the expansions so unusual values are passed as a single argument.
sam package --template-file template.yaml --output-template-file packaged.yaml --s3-bucket "$BUCKET"
echo "Publishing application to the SAR"
sam publish --template packaged.yaml --region "$REGION"
cd -
15 | 16 | -------------------------------------------------------------------------------- /src/cleanup_resources_function/cleanup_resources.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | """Cleans up resources created by this application outside of CloudFormation 5 | 6 | This function is called as a CloudFormation custom resource. 7 | """ 8 | 9 | from crhelper import CfnResource 10 | from aws_lambda_powertools import Logger 11 | 12 | from common import ( 13 | PROJECT_NAME, 14 | ALARM_NAME_PREFIX, 15 | extract_region, 16 | get_client, 17 | determine_campaign_arns 18 | ) 19 | 20 | logger = Logger() 21 | helper = CfnResource() 22 | 23 | @helper.delete 24 | def delete_resource(event, _): 25 | campaign_arns = determine_campaign_arns(event.get('ResourceProperties')) 26 | 27 | logger.debug('Campaigns to check for resources to delete: %s', campaign_arns) 28 | 29 | regions = set() 30 | 31 | for campaign_arn in campaign_arns: 32 | regions.add(extract_region(campaign_arn)) 33 | 34 | logger.debug('Regions to check for resources to delete: %s', regions) 35 | 36 | alarms_deleted = 0 37 | 38 | for region in regions: 39 | cw = get_client(service_name = 'cloudwatch', region_name = region) 40 | 41 | alarm_names_to_delete = set() 42 | 43 | alarms_paginator = cw.get_paginator('describe_alarms') 44 | for alarms_page in alarms_paginator.paginate(AlarmNamePrefix = ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']): 45 | for alarm in alarms_page['MetricAlarms']: 46 | tags_response = cw.list_tags_for_resource(ResourceARN = alarm['AlarmArn']) 47 | 48 | for tag in tags_response['Tags']: 49 | if tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME: 50 | alarm_names_to_delete.add(alarm['AlarmName']) 51 | break 52 | 53 | if alarm_names_to_delete: 54 | # FUTURE: max check of 100 55 | logger.info('Deleting CloudWatch alarms in %s for 
@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context):
    """Update the minProvisionedTPS value for an existing Personalize campaign.

    The parameters are read from ``event['detail']`` when that key is present
    and non-empty (EventBridge invocation); otherwise they are read from the
    top level of ``event`` (direct invocation).

    Parameters read from the event:
        CampaignARN: ARN of the campaign to update (required).
        MinProvisionedTPS: new minProvisionedTPS value; must be >= 1 (required).
        Reason: description of why the update was requested (optional).

    Returns:
        str: message confirming that the update was initiated.

    Raises:
        KeyError: if ``CampaignARN`` or ``MinProvisionedTPS`` is missing.
        ValueError: if ``MinProvisionedTPS`` is less than 1.
        Exception: if no region can be extracted from the campaign ARN.
    """
    params = event.get('detail') or event

    campaign_arn = params['CampaignARN']
    min_tps = params['MinProvisionedTPS']
    # Reason is optional for both invocation styles; a default is built below.
    reason = params.get('Reason')

    if min_tps < 1:
        raise ValueError('"MinProvisionedTPS" must be >= 1')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name = 'personalize', region_name = region)

    response = personalize.update_campaign(campaignArn = campaign_arn, minProvisionedTPS = min_tps)

    # Serializing the response is only worth the cost when it will be logged.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent = 2, default = str))

    if not reason:
        reason = (
            f'Amazon Personalize campaign {campaign_arn} minProvisionedTPS update '
            f'to {min_tps} initiated (reason unspecified)'
        )

    put_event(
        detail_type = 'PersonalizeCampaignMinProvisionedTPSUpdated',
        detail = json.dumps({
            'CampaignARN': campaign_arn,
            'NewMinProvisionedTPS': min_tps,
            'Reason': reason
        }),
        resources = [ campaign_arn ]
    )

    logger.info({
        'campaignArn': campaign_arn,
        'minProvisionedTPS': min_tps
    })

    return f'Successfully initiated update of minProvisionedTPS to {min_tps} for campaign {campaign_arn}'
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /src/personalize_delete_campaign_function/personalize_delete_campaign.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | """ 5 | Lambda function that is used to delete a Personalize campaign based on prolonged idle time 6 | and according to configuration to automatically delete campaigns under these conditions. 
def delete_alarms_for_campaign(campaign_arn):
    """Delete the CloudWatch alarms this application created for a campaign.

    An alarm is selected when its name starts with ``ALARM_NAME_PREFIX``, it
    has a ``CampaignArn`` dimension equal to ``campaign_arn``, and it carries
    a ``CreatedBy`` tag whose value is ``PROJECT_NAME``.

    Args:
        campaign_arn: ARN of the campaign whose alarms should be removed. The
            CloudWatch client is created for the region extracted from it.

    Returns:
        int: number of alarms that were deleted (0 when none matched).
    """
    # Imported locally: the module-level import from ``common`` does not
    # bring these two constants into scope.
    from common import ALARM_NAME_PREFIX, PROJECT_NAME

    # DeleteAlarms accepts at most 100 alarm names per call.
    max_alarms_per_call = 100

    cw = get_client(service_name = 'cloudwatch', region_name = extract_region(campaign_arn))

    alarm_names_to_delete = set()

    alarms_paginator = cw.get_paginator('describe_alarms')
    for alarms_page in alarms_paginator.paginate(AlarmNamePrefix = ALARM_NAME_PREFIX, AlarmTypes=['MetricAlarm']):
        for alarm in alarms_page['MetricAlarms']:
            # Tolerate alarms that carry no Dimensions entry at all.
            is_for_campaign = any(
                dim['Name'] == 'CampaignArn' and dim['Value'] == campaign_arn
                for dim in alarm.get('Dimensions', [])
            )
            if not is_for_campaign:
                continue

            # Only look up tags for alarms that already match the campaign.
            tags_response = cw.list_tags_for_resource(ResourceARN = alarm['AlarmArn'])
            created_by_project = any(
                tag['Key'] == 'CreatedBy' and tag['Value'] == PROJECT_NAME
                for tag in tags_response['Tags']
            )
            if created_by_project:
                alarm_names_to_delete.add(alarm['AlarmName'])

    if not alarm_names_to_delete:
        logger.info('No CloudWatch alarms to delete for campaign %s', campaign_arn)
        return 0

    logger.info('Deleting CloudWatch alarms for campaign %s: %s', campaign_arn, alarm_names_to_delete)

    alarm_names = sorted(alarm_names_to_delete)
    for start in range(0, len(alarm_names), max_alarms_per_call):
        cw.delete_alarms(AlarmNames=alarm_names[start:start + max_alarms_per_call])

    return len(alarm_names)

@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context):
    """Initiate the delete of a Personalize campaign and clean up its alarms.

    The parameters are read from ``event['detail']`` when that key is present
    and non-empty (EventBridge invocation); otherwise they are read from the
    top level of ``event`` (direct invocation).

    Parameters read from the event:
        CampaignARN: ARN of the campaign to delete (required).
        Reason: description of why the campaign is being deleted (optional).

    After the delete call, a ``PersonalizeCampaignDeleted`` event and a
    ``BuildPersonalizeMonitorDashboard`` event are published via ``put_event``
    and the campaign's alarms are deleted.

    Returns:
        str: message confirming that the delete was initiated.

    Raises:
        KeyError: if ``CampaignARN`` is missing.
        Exception: if no region can be extracted from the campaign ARN.
    """
    params = event.get('detail') or event

    campaign_arn = params['CampaignARN']
    # Reason is optional for both invocation styles; a default is built below.
    reason = params.get('Reason')

    region = extract_region(campaign_arn)
    if not region:
        raise Exception('Region could not be extracted from campaign_arn')

    personalize = get_client(service_name = 'personalize', region_name = region)

    response = personalize.delete_campaign(campaignArn = campaign_arn)

    # Serializing the response is only worth the cost when it will be logged.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(response, indent = 2, default = str))

    if not reason:
        reason = f'Amazon Personalize campaign {campaign_arn} deletion initiated (reason unspecified)'

    event_detail = json.dumps({
        'CampaignARN': campaign_arn,
        'Reason': reason
    })

    # The same detail payload is published under both detail types.
    for detail_type in ('PersonalizeCampaignDeleted', 'BuildPersonalizeMonitorDashboard'):
        put_event(
            detail_type = detail_type,
            detail = event_detail,
            resources = [ campaign_arn ]
        )

    logger.info({
        'campaignArn': campaign_arn
    })

    delete_alarms_for_campaign(campaign_arn)

    return f'Successfully initiated delete of campaign {campaign_arn}'
Sheet](https://github.com/aws-samples/amazon-personalize-samples/blob/master/PersonalizeCheatSheet2.0.md).\n\nResources: [Service Documentation](https://docs.aws.amazon.com/personalize/latest/dg/what-is-personalize.html) | [Personalize Blog](https://aws.amazon.com/blogs/machine-learning/category/artificial-intelligence/amazon-personalize/) | [Samples on GitHub](https://github.com/aws-samples/amazon-personalize-samples)\n" 23 | } 24 | } 25 | {{#dataset_groups}} 26 | ,{ 27 | "type": "text", 28 | "width": 24, 29 | "height": 1, 30 | "properties": { 31 | "markdown": "\n### Dataset Group: **{{name}}** ({{region}}) | [Manage](https://console.aws.amazon.com/personalize/home?region={{region}}#arn:aws:personalize:{{region}}:{{account_id}}:dataset-group${{name}}/campaigns)\n" 32 | } 33 | }, 34 | { 35 | "type": "metric", 36 | "width": 8, 37 | "height": 8, 38 | "properties": { 39 | "metrics": [ 40 | {{#campaigns}} 41 | ["{{namespace}}", "minProvisionedTPS", "CampaignArn", "{{campaign_arn}}", { 42 | "label": "{{name}} minProvisionedTPS" 43 | }], 44 | ["{{namespace}}", "averageTPS", "CampaignArn", "{{campaign_arn}}", { 45 | "label": "{{name}} averageTPS" 46 | }]{{^last_campaign}}, {{/last_campaign}} 47 | {{/campaigns}} 48 | ], 49 | "region": "{{region}}", 50 | "view": "timeSeries", 51 | "stacked": false, 52 | "stat": "Average", 53 | "period": 300, 54 | "title": "Actual vs Provisioned TPS", 55 | "yAxis": { 56 | "left": { 57 | "label": "TPS", 58 | "min": 0, 59 | "showUnits": false 60 | }, 61 | "right": { 62 | "showUnits": true, 63 | "label": "" 64 | } 65 | }, 66 | "annotations": { 67 | "horizontal": [{ 68 | "label": "Lowest TPS Allowed", 69 | "value": 1 70 | }] 71 | } 72 | } 73 | }, 74 | { 75 | "type": "metric", 76 | "width": 8, 77 | "height": 8, 78 | "properties": { 79 | "view": "timeSeries", 80 | "stacked": false, 81 | "metrics": [ 82 | {{#campaigns}} 83 | ["{{namespace}}", "campaignUtilization", "CampaignArn", "{{campaign_arn}}", { 84 | "label": "{{name}} campaignUtilization" 85 
| }]{{^last_campaign}}, {{/last_campaign}} 86 | {{/campaigns}} 87 | ], 88 | "region": "{{region}}", 89 | "title": "Campaign Utilization" 90 | } 91 | }, 92 | { 93 | "type": "metric", 94 | "width": 8, 95 | "height": 8, 96 | "properties": { 97 | "view": "timeSeries", 98 | "stacked": false, 99 | "metrics": [ 100 | {{#campaigns}} 101 | ["AWS/Personalize", "{{campaign_latency_metric_name}}", "CampaignArn", "{{campaign_arn}}", { 102 | "label": "{{name}} {{campaign_latency_metric_name}}" 103 | }]{{^last_campaign}}, {{/last_campaign}} 104 | {{/campaigns}} 105 | ], 106 | "region": "{{region}}", 107 | "title": "Campaign Latency" 108 | } 109 | } 110 | {{/dataset_groups}} 111 | ] 112 | } -------------------------------------------------------------------------------- /src/personalize_delete_campaign_function/README.md: -------------------------------------------------------------------------------- 1 | # Amazon Personalize Monitor - Delete Campaign Function 2 | 3 | This Lambda function deletes a Personalize campaign. It is called as the target of an EventBridge rule that matches events with the `DeletePersonalizeCampaign` detail-type. The [personalize-monitor](../personalize_monitor_function/) function publishes this event when the `AutoDeleteIdleCampaigns` deployment parameter is `Yes` AND a monitored campaign has been idle more than `IdleCampaignThresholdHours` hours. Therefore, an idle campaign is one that has not had any `GetRecommendations` or `GetPersonalizedRanking` calls in the last `IdleCampaignThresholdHours` hours. 4 | 5 | This function will also delete any CloudWatch alarms that were dynamically created by this application for the deleted campaign. Alarms can be created for idle campaigns and low utilization campaigns via the `AutoCreateIdleCampaignAlarms` and `AutoCreateCampaignUtilizationAlarms` deployment parameters. 
6 | 7 | ## How it works 8 | 9 | The EventBridge event structure that triggers this function looks something like this: 10 | 11 | ```javascript 12 | { 13 | "source": "personalize.monitor", 14 | "detail-type": "DeletePersonalizeCampaign", 15 | "resources": [ CAMPAIGN_ARN_TO_DELETE ], 16 | "detail": { 17 | 'CampaignARN': CAMPAIGN_ARN_TO_DELETE, 18 | 'CampaignUtilization': CURRENT_UTILIZATION, 19 | 'CampaignAgeHours': CAMPAIGN_AGE_IN_HOURS, 20 | 'IdleCampaignThresholdHours': CAMPAIGN_IDLE_HOURS, 21 | 'TotalRequestsDuringIdleThresholdHours': 0, 22 | 'Reason': DESCRIPTIVE_REASON_FOR_DELETE 23 | } 24 | } 25 | ``` 26 | 27 | This function can also be invoked directly as part of your own operational process. The event you pass to the function only requires the campaign ARN as follows. 28 | 29 | ```javascript 30 | { 31 | "CampaignARN": CAMPAIGN_ARN_TO_DELETE, 32 | "Reason": OPTIONAL_DESCRIPTIVE_REASON_FOR_DELETE 33 | } 34 | ``` 35 | 36 | The Personalize [DeleteCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_DeleteCampaign.html) API is used to delete the campaign. 37 | 38 | ## Published events 39 | 40 | When the deletion of a campaign and any dynamically created CloudWatch alarms for the campaign have been successfully initiated by this function, two events are published to EventBridge. One event will trigger a notification to the SNS topic for this application and the other trigger the CloudWatch dashboard to be rebuilt. 41 | 42 | ### Delete notification 43 | 44 | The following event is published to EventBridge to signal that a campaign has been deleted. 45 | 46 | ```javascript 47 | { 48 | "source": "personalize.monitor", 49 | "detail_type": "PersonalizeCampaignDeleted", 50 | "resources": [ CAMPAIGN_ARN_DELETED ], 51 | "detail": { 52 | "CampaignARN": CAMPAIGN_ARN_DELETED, 53 | "Reason": DESCRIPTIVE_REASON_FOR_DELETE 54 | } 55 | } 56 | ``` 57 | 58 | An EventBridge rule is setup that will target an SNS topic with `NotificationEndpoint` as the subscriber. 
This is the email address you provided at deployment time. If you'd like, you can customize how these notification events are handled in the EventBridge and SNS consoles. 59 | 60 | ### Rebuild CloudWatch dashboard 61 | 62 | Since a monitored campaign has been deleted, the CloudWatch dashboard needs to be rebuilt so that the campaign is removed from the widgets. This is accomplished by publishing a `BuildPersonalizeMonitorDashboard` event that is processed by the [dashboard_mgmt](../dashboard_mgmt_function/) function. 63 | 64 | ```javascript 65 | { 66 | "source": "personalize.monitor", 67 | "detail_type": "BuildPersonalizeMonitorDashboard", 68 | "resources": [ CAMPAIGN_ARN_DELETED ], 69 | "detail": { 70 | "CampaignARN": CAMPAIGN_ARN_DELETED, 71 | "Reason": DESCRIPTIVE_REASON_FOR_REBUILD 72 | } 73 | } 74 | ``` 75 | -------------------------------------------------------------------------------- /src/personalize_update_campaign_tps_function/README.md: -------------------------------------------------------------------------------- 1 | # Amazon Personalize Monitor - Campaign Provisioned TPS Update Function 2 | 3 | This Lambda function adjusts the `minProvisionedTPS` value for a Personalize campaign. It is called as the target of EventBridge rules for events emitted by the [personalize_monitor](../personalize_monitor_function/) function when configured to update campaigns based on actual TPS activity. You can also incorporate this function into your own operations to scale campaigns up and down. For example, if you know your campaign will experience a massive spike in requests at a certain time (i.e. flash sale) and you want to pre-warm your campaign capacity, you can create a [CloudWatch event](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/RunLambdaSchedule.html) to call this function 30 minutes before the expected spike in traffic to increase the `minProvisionedTPS` and then again after the traffic event to lower the `minProvisionedTPS`. 
Alternatively, if there are certain events that occur in your application that you know will generate a predictably higher or lower volume of requests than the current `minProvisionedTPS` **AND** Personalize's auto-scaling will not suffice, you can use this function as a trigger to adjust `minProvisionedTPS` accordingly. 4 | 5 | ## How it works 6 | 7 | The EventBridge event structure that triggers this function looks something like this: 8 | 9 | ```javascript 10 | { 11 | "source": "personalize.monitor", 12 | "detail-type": "UpdatePersonalizeCampaignMinProvisionedTPS", 13 | "resources": [ CAMPAIGN_ARN_TO_UPDATE ], 14 | "detail": { 15 | "CampaignARN": CAMPAIGN_ARN_TO_UPDATE, 16 | "CampaignUtilization": CURRENT_UTILIZATION, 17 | "CampaignAgeHours": CAMPAIGN_AGE_IN_HOURS, 18 | "CurrentProvisionedTPS": CURRENT_MIN_PROVISIONED_TPS, 19 | "MinProvisionedTPS": NEW_MIN_PROVISIONED_TPS, 20 | "MinAverageTPS": MIN_AVERAGE_TPS_LAST_24_HOURS, 21 | "MaxAverageTPS": MAX_AVERAGE_TPS_LAST_24_HOURS, 22 | "Datapoints": [ CW_METRIC_DATAPOINTS_LAST_24_HOURS ], 23 | "Reason": DESCRIPTIVE_REASON_FOR_UPDATE 24 | } 25 | } 26 | ``` 27 | 28 | This function can also be invoked directly as part of your own operational process. The event you pass to the function only requires the campaign ARN and new `minProvisionedTPS` as follows. 29 | 30 | ```javascript 31 | { 32 | "CampaignARN": "CAMPAIGN_ARN_HERE", 33 | "MinProvisionedTPS": NEW_MIN_PROVISIONED_TPS_HERE, 34 | "Reason": DESCRIPTIVE_REASON_FOR_UPDATE 35 | } 36 | ``` 37 | 38 | The Personalize [UpdateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_UpdateCampaign.html) API is used to update the `minProvisionedTPS` value. 39 | 40 | ## Published events 41 | 42 | When an update of a campaign's `minProvisionedTPS` has been successfully initiated by this function, an event is published to EventBridge to trigger a notification.
43 | 44 | > Since it can take several minutes for a campaign to redeploy after updating its `minProvisionedTPS`, you will receive the notification when the redeploy starts. The campaign will continue to respond to `GetRecommendations`/`GetPersonalizedRanking` API requests while it is redeploying. **Therefore, there will be no interruption of service.** 45 | 46 | ### Update minProvisionedTPS notification 47 | 48 | The following event is published to EventBridge to signal that an update to a campaign has been initiated. 49 | 50 | ```javascript 51 | { 52 | "source": "personalize.monitor", 53 | "detail-type": "PersonalizeCampaignMinProvisionedTPSUpdated", 54 | "resources": [ CAMPAIGN_ARN_UPDATED ], 55 | "detail": { 56 | "CampaignARN": CAMPAIGN_ARN_UPDATED, 57 | "NewMinProvisionedTPS": NEW_TPS, 58 | "Reason": DESCRIPTIVE_REASON_FOR_UPDATE 59 | } 60 | } 61 | ``` 62 | 63 | An EventBridge rule is set up that will target an SNS topic with `NotificationEndpoint` as the subscriber. This is the email address you provided at deployment time. If you'd like, you can customize how these notification events are handled or add your own targets in the EventBridge and SNS consoles. 64 | -------------------------------------------------------------------------------- /src/dashboard_mgmt_function/README.md: -------------------------------------------------------------------------------- 1 | # Amazon Personalize Monitor - CloudWatch Dashboard Create/Update/Delete Function 2 | 3 | The [dashboard_mgmt.py](./dashboard_mgmt.py) Lambda function is responsible for creating, updating/refreshing, and deleting the [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) for this application.
It is called in the following contexts: 4 | 5 | - As part of the CloudFormation deployment process for this application as a [custom resource](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/template-custom-resources.html) (create, update, delete). 6 | - In response to the `BuildPersonalizeMonitorDashboard` CloudWatch event being handled. This event is published to the default [Amazon EventBridge](https://docs.aws.amazon.com/eventbridge/latest/userguide/what-is-amazon-eventbridge.html) event bus when a monitored campaign is automatically deleted so that the dashboard can be rebuilt. An EventBridge rule is used to trigger this function to be invoked when the event is received. 7 | - At the top of every hour, triggered by a scheduled CloudWatch event. This ensures that any campaigns that are created or deleted (outside of this application) that meet the monitoring criteria are added to the dashboard. 8 | 9 | The dashboard will include line graph widgets for Actual vs Provisioned TPS, Campaign Utilization, and Campaign Latency for the Personalize campaigns you wish to monitor. Here is an example of a dashboard. 10 | 11 | ![Personalize Monitor CloudWatch Dashboard](../../images/personalize-monitor-cloudwatch-dashboard.png) 12 | 13 | ## How it works 14 | 15 | The EventBridge event structure that triggers this function looks something like this: 16 | 17 | ```javascript 18 | { 19 | "source": "personalize.monitor", 20 | "detail-type": "BuildPersonalizeMonitorDashboard", 21 | "resources": [ CAMPAIGN_ARN_THAT_TRIGGERED ], 22 | "detail": { 23 | "Reason": DESCRIPTIVE_REASON_FOR_UPDATE 24 | } 25 | } 26 | ``` 27 | 28 | This function can also be invoked directly as part of your own operational process. The `Reason` is optional and just used for logging. 
29 | 30 | ```javascript 31 | { 32 | "Reason": DESCRIPTIVE_REASON_FOR_UPDATE 33 | } 34 | ``` 35 | 36 | ### Create/Update 37 | 38 | When called as part of this application's create or update deployment process or as a result of the `BuildPersonalizeMonitorDashboard` event, the function first determines what Personalize campaigns should be monitored based on the CloudFormation template parameters you specified when you [installed](../../README.md#installing-the-application) the application. The monitored campaigns are grouped by [dataset group](https://docs.aws.amazon.com/personalize/latest/dg/data-prep-ds-group.html) and placed in a dictionary that is passed to the Python [chevron](https://github.com/noahmorrison/chevron) library to render the [dashboard template](./dashboard-template.mustache) file. The template uses the [mustache templating language](http://mustache.github.io/) to build the widgets. 39 | 40 | Once the template is rendered as dashboard source (JSON), the dashboard source is used to create or update the CloudWatch dashboard by calling the [PutDashboard API](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutDashboard.html). 41 | 42 | Therefore, if you want to change what campaigns are monitored, just re-deploy this application and the current dashboard will be overwritten with your campaign changes or wait for the dashboard to automatically update itself (subject to campaign monitoring configuration). **This also means that any manual changes you make to the Personalize Monitor dashboard will be lost.** If you want to add your own widgets to the dashboard or change the existing widgets, you can fork this repository, change the [dashboard-template.mustache](./dashboard-template.mustache) template file, and deploy into your AWS account. 43 | 44 | ### Delete 45 | 46 | When the CloudFormation stack is deleted for this application, this function will delete the dashboard.
47 | 48 | ## Calling from your own code 49 | 50 | You can trigger the CloudWatch dashboard to be rebuilt by publishing an event with the `BuildPersonalizeMonitorDashboard` detail-type from your own code. Here is an example in Python. 51 | 52 | ```python 53 | import boto3 54 | import json 55 | 56 | event_bridge = boto3.client('events') 57 | 58 | event_bridge.put_events( 59 | Entries=[ 60 | { 61 | 'Source': 'personalize.monitor', 62 | 'DetailType': 'BuildPersonalizeMonitorDashboard', 63 | 'Detail': json.dumps({ 64 | 'Reason': 'Rebuild the dashboard because I said so' 65 | }) 66 | } 67 | ] 68 | ) 69 | ``` -------------------------------------------------------------------------------- /src/dashboard_mgmt_function/dashboard_mgmt.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | """Manages create/update/delete of the Personalize Monitor CloudWatch dashboard 5 | 6 | This function is called two ways: 7 | 8 | 1. From CloudFormation when the application is deployed, updated, or deleted in an AWS 9 | account. When the resource is created, this function will create the Personalize 10 | Monitor Dashboard in CloudWatch populated with widgets for monitoring Personalize 11 | resources configured as deployment parameters. 12 | 13 | When this resource is updated (i.e. redeployed), the dashboard will be rebuilt and 14 | updated/replaced. 15 | 16 | When this resource is deleted, this function will delete the CloudWatch Dashboard. 17 | 18 | 2. As the target of an EventBridge rule that signals that the dashboard should be 19 | rebuilt as a result of an event occurring. The event could be after a campaign has 20 | been deleted and therefore a good point to rebuild the dashboard. It could also 21 | be set up to periodically rebuild the dashboard on a schedule so it picks up new 22 | campaigns too.
23 | 24 | See the layer_dashboard Lambda Layer for details on how the dashboard is built. 25 | """ 26 | 27 | import json 28 | import os 29 | import boto3 30 | import chevron 31 | 32 | from crhelper import CfnResource 33 | from aws_lambda_powertools import Logger 34 | from common import ( 35 | extract_region, 36 | extract_account_id, 37 | get_client, 38 | get_configured_active_campaigns 39 | ) 40 | 41 | logger = Logger() 42 | helper = CfnResource() 43 | 44 | cloudwatch = boto3.client('cloudwatch') 45 | 46 | DASHBOARD_NAME = 'Personalize-Monitor' 47 | # Renders dashboard-template.mustache for the configured active campaigns and creates/updates the CloudWatch dashboard. 48 | def build_dashboard(event): 49 | # Will hold the data used to render the template. 50 | template_data = {} 51 | 52 | template_data['namespace'] = 'PersonalizeMonitor' 53 | template_data['current_region'] = os.environ['AWS_REGION'] 54 | 55 | logger.debug('Loading active campaigns') 56 | 57 | campaigns = get_configured_active_campaigns(event) 58 | template_data['active_campaign_count'] = len(campaigns) 59 | 60 | # Group campaigns by dataset group so we can create DSG specific widgets in rows 61 | campaigns_by_dsg_arn = {} 62 | # Holds DSG info so we only have to describe each DSG once 63 | dsgs_by_arn = {} 64 | 65 | for campaign in campaigns: 66 | logger.info('Campaign %s will be added to the dashboard', campaign['campaignArn']) 67 | 68 | campaign_region = extract_region(campaign['campaignArn']) 69 | 70 | personalize = get_client('personalize', campaign_region) 71 | # The solution version tells us the campaign's dataset group and recipe. 72 | response = personalize.describe_solution_version(solutionVersionArn = campaign['solutionVersionArn']) 73 | 74 | dsg_arn = response['solutionVersion']['datasetGroupArn'] 75 | recipe_arn = response['solutionVersion']['recipeArn'] 76 | 77 | dsg = dsgs_by_arn.get(dsg_arn) 78 | if not dsg: 79 | response = personalize.describe_dataset_group(datasetGroupArn = dsg_arn) 80 | dsg = response['datasetGroup'] 81 | dsgs_by_arn[dsg_arn] = dsg 82 | 83 | campaign_datas = campaigns_by_dsg_arn.get(dsg_arn) 84 | if not campaign_datas: 85 | campaign_datas = [] 86 |
campaigns_by_dsg_arn[dsg_arn] = campaign_datas 87 | 88 | campaign_data = { 89 | 'name': campaign['name'], 90 | 'campaign_arn': campaign['campaignArn'], 91 | 'region': campaign_region 92 | } 93 | # Personalized-ranking campaigns use a different latency metric name than recommendation campaigns. 94 | if recipe_arn == 'arn:aws:personalize:::recipe/aws-personalized-ranking': 95 | campaign_data['campaign_latency_metric_name'] = 'GetPersonalizedRankingLatency' 96 | else: 97 | campaign_data['campaign_latency_metric_name'] = 'GetRecommendationsLatency' 98 | 99 | campaign_datas.append(campaign_data) 100 | 101 | dsgs_for_template = [] 102 | 103 | for dsg_arn, campaign_datas in campaigns_by_dsg_arn.items(): 104 | dsg = dsgs_by_arn[dsg_arn] 105 | 106 | # Minor hack to know when we're on the last item in list when iterating in template. 107 | campaign_datas[len(campaign_datas) - 1]['last_campaign'] = True 108 | 109 | dsgs_for_template.append({ 110 | 'name': dsg['name'], 111 | 'region': extract_region(dsg_arn), 112 | 'account_id': extract_account_id(dsg_arn), 113 | 'campaigns': campaign_datas 114 | }) 115 | 116 | template_data['dataset_groups'] = dsgs_for_template 117 | 118 | # Render template and use as dashboard body.
119 | with open('dashboard-template.mustache', 'r') as f: 120 | dashboard = chevron.render(f, template_data) 121 | 122 | logger.debug(json.dumps(dashboard, indent = 2, default = str)) 123 | 124 | logger.info('Adding/updating dashboard') 125 | 126 | cloudwatch.put_dashboard( 127 | DashboardName = DASHBOARD_NAME, 128 | DashboardBody = dashboard 129 | ) 130 | # Deletes the CloudWatch dashboard named by DASHBOARD_NAME. 131 | def delete_dashboard(): 132 | logger.info('Deleting dashboard') 133 | 134 | cloudwatch.delete_dashboards( 135 | DashboardNames = [ DASHBOARD_NAME ] 136 | ) 137 | # Registered with crhelper for CloudFormation create and update requests. 138 | @helper.create 139 | @helper.update 140 | def create_or_update_resource(event, _): 141 | build_dashboard(event) 142 | # Registered with crhelper for CloudFormation delete requests. 143 | @helper.delete 144 | def delete_resource(event, _): 145 | delete_dashboard() 146 | # Lambda entry point: custom resource requests go to crhelper; any other invocation rebuilds the dashboard. 147 | @logger.inject_lambda_context(log_event=True) 148 | def lambda_handler(event, context): 149 | # If the event has a RequestType, we're being called by CFN as custom resource 150 | if event.get('RequestType'): 151 | logger.info('Called via CloudFormation as a custom resource; letting CfnResource route request') 152 | helper(event, context) 153 | else: 154 | logger.info('Called via Invoke; assuming caller wants to build dashboard') 155 | # The optional Reason is read from 'detail' when present, otherwise from the top level of the event. 156 | if event.get('detail'): 157 | reason = event['detail'].get('Reason') 158 | else: 159 | reason = event.get('Reason') 160 | 161 | if reason: 162 | logger.info('Reason for dashboard build: %s', reason) 163 | 164 | build_dashboard(event) -------------------------------------------------------------------------------- /src/personalize_monitor_function/README.md: -------------------------------------------------------------------------------- 1 | # Amazon Personalize Monitor - Core Monitor Function 2 | 3 | The [personalize_monitor.py](./personalize_monitor.py) Lambda is called every 5 minutes by a CloudWatch scheduled event rule to generate the CloudWatch metrics needed to populate the Personalize Monitor dashboard line graph widgets and to trigger the CloudWatch alarms for low campaign utilization and
idle campaign detection (if configured). Also, if the `AutoDeleteIdleCampaigns` deployment parameter is `Yes` AND a monitored campaign has been idle more than `IdleCampaignThresholdHours` hours, this function will publish a `DeletePersonalizeCampaign` event to EventBridge that is handled by the [personalize_delete_campaign](../personalize_delete_campaign_function/) function. An idle campaign is one that has not had any `GetRecommendations` or `GetPersonalizedRanking` calls in the last `IdleCampaignThresholdHours` hours. Finally, this function will adjust a campaign's `minProvisionedTPS` (down only) if the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter is `Yes`. 4 | 5 | ## How it works 6 | 7 | The function first determines what Personalize campaigns should be monitored based on the CloudFormation template parameters you specify when you [install](../../README.md#installing-the-application) the application. 8 | 9 | ## CloudWatch Metrics 10 | 11 | The following custom CloudWatch metrics are generated by this function on 5 minute intervals. You can find these metrics in the AWS console under CloudWatch and then Metrics or you can query them using the CloudWatch API. 12 | 13 | | Namespace | MetricName | Dimensions | Unit | Description | 14 | | --- | --- | --- | --- | --- | 15 | | PersonalizeMonitor | monitoredCampaignCount | | Count | Number of campaigns currently being monitored at interval | 16 | | PersonalizeMonitor | minProvisionedTPS | CampaignArn | Count/Second | `minProvisionedTPS` value for the campaign at interval | 17 | | PersonalizeMonitor | averageTPS | CampaignArn | Count/Second | Average TPS for the campaign at interval | 18 | | PersonalizeMonitor | campaignUtilization | CampaignArn | Percent | Utilization percentage of `averageTPS` vs `minProvisionedTPS` at interval | 19 | 20 | ### How is averageTPS calculated?
21 | 22 | The `averageTPS` metric value for each monitored campaign is calculated by first determining the number of requests made to the campaign during the 5 minute interval and dividing by 300 (the number of seconds in 5 minutes). The number of requests is pulled from the `GetRecommendations` or `GetPersonalizedRanking` metric (depending on the recipe for the campaign's solution) for the campaign from the `AWS/Personalize` namespace. This metric is automatically updated by Personalize itself. 23 | 24 | ## CloudWatch Alarms (optional) 25 | 26 | You can optionally have CloudWatch alarms dynamically created for monitored campaigns for low campaign utilization and idle campaigns. 27 | 28 | ### Low Campaign Utilization Alarm 29 | 30 | If you set the `AutoCreateCampaignUtilizationAlarms` CloudFormation template parameter to `Yes` when you installed this application, this function will automatically create a CloudWatch alarm for every campaign that it monitors. The alarm will trigger when the `campaignUtilization` custom metric described above drops below the `CampaignThresholdAlarmLowerBound` installation parameter for 9 out of 12 evaluation periods. Since the intervals are 5 minutes, that means that 9 of the 12 five minute evaluations over a 60 minute span must be below the threshold to enter an alarm status. The same rule applies to transition from alarm to OK status. The alarm will be created in the region where the campaign was created. An [SNS](https://aws.amazon.com/sns/) topic created by this application will be used as the alarm and ok actions and the `NotificationEndpoint` (email address) deployment parameter will be setup as a subscriber to the topic. 
**Be sure to confirm the subscription sent when this application is deployed by clicking on the one-time confirmation email sent by SNS.** 31 | 32 | The alarm will have its actions disabled when the `minProvisionedTPS` is 1 and enabled when `minProvisionedTPS` is > 1 so that notifications are only sent when utilization can be impacted by adjusting `minProvisionedTPS`. 33 | 34 | ### Idle Campaign Alarm 35 | 36 | If you set the `AutoCreateIdleCampaignAlarms` CloudFormation template parameter to `Yes` when you installed this application, this function will automatically create a CloudWatch alarm for every monitored campaign that is idle for at least `IdleCampaignThresholdHours` hours. The actions for the alarm will be enabled only after the campaign has existed for `IdleCampaignThresholdHours` as well. The `GetRecommendations` or `GetPersonalizedRanking` metric (depending on the campaign's recipe) will be used to assess the campaign's idle state. The alarm will be created in the region where the campaign was created. An [SNS](https://aws.amazon.com/sns/) topic created by this application will be used as the alarm and ok actions and the `NotificationEndpoint` (email address) deployment parameter will be set up as a subscriber to the topic. **Be sure to confirm the subscription sent when this application is deployed by clicking on the one-time confirmation email sent by SNS.** 37 | 38 | ## Automatically adjusting campaign minProvisionedTPS (optional) 39 | 40 | If the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter is `Yes`, this function will check the actual hourly TPS over the last 14 days against the currently configured `minProvisionedTPS` and look for opportunities to reduce the campaign's `minProvisionedTPS` to optimize utilization and reduce costs. It does this by checking the campaign's request volume for the previous 14 days on hourly intervals and finding the hour with the lowest average TPS (low watermark).
If the low watermark average is less than the campaign's `minProvisionedTPS` AND the campaign is more than 1 day old, it will drop the `minProvisionedTPS` by 25%. This process will be repeated each hour until either the `minProvisionedTPS` meets the low watermark TPS or the `minProvisionedTPS` reaches 1 (the lowest allowed value). **This function will NOT increase the `minProvisionedTPS` for a campaign.** Instead it will rely on Personalize to auto-scale campaigns up and back down to `minProvisionedTPS` to meet demand. 41 | 42 | > Since it can take several minutes for a campaign to redeploy after updating its `minProvisionedTPS`, you will receive the notification when the redeploy starts. The campaign will continue to respond to `GetRecommendations`/`GetPersonalizedRanking` API requests while it is redeploying. There will be no interruption of service. 43 | 44 | See the [personalize_update_campaign_tps](../personalize_update_campaign_tps_function/) function for details on the update function. 45 | 46 | ## Automatically deleting idle campaigns (optional) 47 | 48 | If the `AutoDeleteIdleCampaigns` deployment parameter is `Yes`, this function will perform additional checks once per hour for each monitored campaign to see if it has been idle for more than `IdleCampaignThresholdHours` hours. The purpose of this feature is to prevent abandoned campaigns from continuing to incur costs when they are no longer being used. Campaign checks are distributed across each hour in 10 minute blocks in an attempt to spread out the API calls needed to check and update campaigns. 49 | 50 | To avoid too aggressively deleting campaigns, new campaigns that are not more than `IdleCampaignThresholdHours` hours old are exempt from being deleted. Similarly, if a campaign has been updated within `IdleCampaignThresholdHours`, it will also be exempt from being automatically deleted. The idea is that new or actively updated campaigns are likely not safe to delete. 
51 | -------------------------------------------------------------------------------- /src/layer/common.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | """ 5 | Lambda layer functions shared across Lambda functions in this application 6 | """ 7 | 8 | import boto3 9 | import os 10 | import random 11 | 12 | from botocore.exceptions import ClientError 13 | from aws_lambda_powertools import Logger 14 | from expiring_dict import ExpiringDict 15 | 16 | logger = Logger(child=True) 17 | 18 | _clients_by_region = {} 19 | # Since the DescribeCampaign API easily throttles and we just need 20 | # the minProvisionedTPS from the campaign, use a cache to help smooth 21 | # out periods where we get throttled. 22 | _campaign_cache = ExpiringDict(22 * 60) 23 | 24 | PROJECT_NAME = 'PersonalizeMonitor' 25 | ALARM_NAME_PREFIX = PROJECT_NAME + '-' 26 | 27 | def put_event(detail_type, detail, resources = []): 28 | event_bridge = get_client('events') 29 | 30 | logger.info({ 31 | 'detail_type': detail_type, 32 | 'detail': detail, 33 | 'resources': resources 34 | }) 35 | 36 | event_bridge.put_events( 37 | Entries=[ 38 | { 39 | 'Source': 'personalize.monitor', 40 | 'Resources': resources, 41 | 'DetailType': detail_type, 42 | 'Detail': detail 43 | } 44 | ] 45 | ) 46 | 47 | def extract_region(arn): 48 | ''' Extracts region from an AWS ARN ''' 49 | region = None 50 | elements = arn.split(':') 51 | if len(elements) > 3: 52 | region = elements[3] 53 | 54 | return region 55 | 56 | def extract_account_id(arn): 57 | ''' Extracts account ID from an AWS ARN ''' 58 | account_id = None 59 | elements = arn.split(':') 60 | if len(elements) > 4: 61 | account_id = elements[4] 62 | 63 | return account_id 64 | 65 | def get_client(service_name, region_name = None): 66 | if not region_name: 67 | region_name = os.environ['AWS_REGION'] 68 | 69 | ''' Returns 
boto3 client for a service and region ''' 70 | clients_by_service = _clients_by_region.get(region_name) 71 | 72 | if not clients_by_service: 73 | clients_by_service = {} 74 | _clients_by_region[region_name] = clients_by_service 75 | 76 | client = clients_by_service.get(service_name) 77 | 78 | if not client: 79 | client = boto3.client(service_name = service_name, region_name = region_name) 80 | clients_by_service[service_name] = client 81 | 82 | return client 83 | 84 | def determine_regions(event): 85 | ''' Determines regions from function event or environment ''' 86 | # Check event first (list of region names) 87 | regions = None 88 | if event: 89 | regions = event.get('Regions') 90 | 91 | if not regions: 92 | # Check environment variable next for list of region names as CSV 93 | regions = os.environ.get('Regions') 94 | 95 | if not regions: 96 | # Lastly, use current region from environment. 97 | regions = os.environ['AWS_REGION'] 98 | 99 | if regions and isinstance(regions, str): 100 | regions = [exp.strip(' ') for exp in regions.split(',')] 101 | 102 | return regions 103 | 104 | def determine_campaign_arns(event): 105 | ''' Determines Personalize campaign ARNs based on function event or environment ''' 106 | 107 | # Check event first (list of campaign ARNs) 108 | arns = None 109 | if event: 110 | arns = event.get('CampaignARNs') 111 | 112 | if not arns: 113 | # Check environment variable next for list of campaign ARNs as CSV 114 | arns = os.environ.get('CampaignARNs') 115 | 116 | if not arns: 117 | raise Exception('"CampaignARNs" expression required in event or environment') 118 | 119 | if isinstance(arns, str): 120 | arns = [exp.strip(' ') for exp in arns.split(',')] 121 | 122 | logger.debug('CampaignARNs expression: %s', arns) 123 | 124 | # Look for magic value of "all" to mean all active campaigns in configured region(s) 125 | if len(arns) == 1 and arns[0].lower() == 'all': 126 | logger.debug('Retrieving ARNs for all active campaigns') 127 | campaign_arns = [] 
128 | 129 | # Determine regions we need to consider 130 | regions = determine_regions(event) 131 | logger.debug('Regions to scan for active campaigns: %s', regions) 132 | 133 | for region in regions: 134 | personalize = get_client(service_name = 'personalize', region_name = region) 135 | 136 | campaigns_for_region = 0 137 | 138 | campaigns_paginator = personalize.get_paginator('list_campaigns') 139 | for campaigns_page in campaigns_paginator.paginate(): 140 | for campaign in campaigns_page['campaigns']: 141 | campaign_arns.append(campaign['campaignArn']) 142 | campaigns_for_region += 1 143 | 144 | logger.debug('Region %s has %d campaigns', region, campaigns_for_region) 145 | else: 146 | campaign_arns = arns 147 | 148 | return campaign_arns 149 | 150 | def get_configured_active_campaigns(event): 151 | ''' Returns list of active campaigns as configured by function event and/or environment ''' 152 | campaign_arns = determine_campaign_arns(event) 153 | 154 | # Shuffle the list of arns so we don't try to describe campaigns in the same order each 155 | # time and potentially use cached campaign details for the same campaigns further down 156 | # the list due to rare but possible API throttling. 157 | random.shuffle(campaign_arns) 158 | 159 | campaigns = [] 160 | 161 | for campaign_arn in campaign_arns: 162 | campaign_region = extract_region(campaign_arn) 163 | personalize = get_client(service_name = 'personalize', region_name = campaign_region) 164 | campaign = None 165 | 166 | try: 167 | # Always try the DescribeCampaign API directly first. 168 | campaign = personalize.describe_campaign(campaignArn = campaign_arn)['campaign'] 169 | _campaign_cache[campaign_arn] = campaign 170 | except ClientError as e: 171 | error_code = e.response['Error']['Code'] 172 | if error_code == 'ThrottlingException': 173 | logger.error('ThrottlingException trapped when calling DescribeCampaign API for %s', campaign_arn) 174 | 175 | # Fallback to see if we have a cached Campaign to use instead. 
176 | campaign = _campaign_cache.get(campaign_arn) 177 | if campaign: 178 | logger.warn('Using cached campaign object for %s', campaign_arn) 179 | else: 180 | logger.warn('Campaign %s NOT found found in cache; skipping this time', campaign_arn) 181 | elif error_code == 'ResourceNotFoundException': 182 | # Campaign has been deleted; log and skip. 183 | logger.error('Campaign %s no longer exists; skipping', campaign_arn) 184 | else: 185 | raise e 186 | 187 | if campaign: 188 | if campaign['status'] == 'ACTIVE': 189 | latest_status = None 190 | if campaign.get('latestCampaignUpdate'): 191 | latest_status = campaign['latestCampaignUpdate']['status'] 192 | 193 | if not latest_status or (latest_status != 'DELETE PENDING' and latest_status != 'DELETE IN_PROGRESS'): 194 | campaigns.append(campaign) 195 | else: 196 | logger.info('Campaign %s latestCampaignUpdate.status is %s and cannot be monitored in this state; skipping', campaign_arn, latest_status) 197 | else: 198 | logger.info('Campaign %s status is %s and cannot be monitored in this state; skipping', campaign_arn, campaign['status']) 199 | 200 | return campaigns -------------------------------------------------------------------------------- /README-SAR.md: -------------------------------------------------------------------------------- 1 | # Amazon Personalize Monitor 2 | 3 | This project contains the source code and supporting files for deploying a serverless application that adds monitoring, alerting, and optimzation capabilities for [Amazon Personalize](https://aws.amazon.com/personalize/), an AI service from AWS that allows you to create custom ML recommenders based on your data. Highlights include: 4 | 5 | - Generation of additional [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) to track the Average TPS, `minProvisionedTPS`, and Utilization of Personalize [campaign](https://docs.aws.amazon.com/personalize/latest/dg/campaigns.html) over time. 
6 | - [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) to alert you via SNS/email when campaign utilization drops below a configurable threshold (optional). 7 | - [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) populated with graph widgets for Actual vs Provisioned TPS, Campaign Utilization, Campaign Latency, and the number of campaigns being monitored. 8 | - Capable of monitoring campaigns across multiple regions in the same AWS account. 9 | - Automatically delete campaigns that have been idle more than a configurable number of hours (optional). 10 | - Automatically reduce the `minProvisionedTPS` for over-provisioned campaigns to optimize cost (optional). 11 | 12 | ## Why is this important? 13 | 14 | Once you create a solution and solution version based on your data, an Amazon Personalize campaign can be created that allows you to retrieve recommendations in real-time based on the solution version. This is typically how Personalize is integrated into your applications. When an application needs to display personalized recommendations to a user, a [GetRecommendations](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#recommendations) or [GetPersonalizedRanking](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#rankings) API call is made to a campaign to retrieve recommendations. Just like monitoring your own application components is important, monitoring your Personalize campaigns is also important and considered a best practice. This application is designed to help you do just that. 15 | 16 | When you provision a campaign using the [CreateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_CreateCampaign.html) API, you must specify a value for `minProvisionedTPS`. 
This value specifies the requested _minimum_ provisioned transactions (calls) per second that Amazon Personalize will support for that campaign. As your actual request volume to a campaign approaches its `minProvisionedTPS`, Personalize will automatically provision additional resources to meet your request volume. Then when request volume drops, Personalize will automatically scale back down **no lower** than `minProvisionedTPS`. **Since you are billed based on the higher of actual TPS and `minProvisionedTPS`, it is therefore important to not over-provision your campaigns to optimize cost.** This also means that leaving a campaign idle (active but no longer in-use) will result in unnecessary charges. This application gives you the tools to visualize your campaign utilization, to be notified when there is an opportunity to tune your campaign provisioning, and even take action to reduce and eliminate over-provisioning. 17 | 18 | > General best practice is to set `minProvisionedTPS` to `1`, or your low watermark for campaign recommendations requests, and let Personalize auto-scale campaign resources to meet actual demand. 19 | 20 | See the Amazon Personalize [pricing page](https://aws.amazon.com/personalize/pricing/) for full details on costs. 21 | 22 | ### CloudWatch Dashboard 23 | 24 | When you deploy this application, a [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) is built with widgets for Actual vs Provisioned TPS, Campaign Utilization, and Campaign Latency for the campaigns you wish to monitor. The dashboard gives you critical visual information to assess how your campaigns are performing and being utilized. The data in these graphs can help you properly tune your campaign's `minProvisionedTPS`. 
25 | 26 | ![Personalize Monitor CloudWatch Dashboard](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-dashboard.png) 27 | 28 | For more details on the CloudWatch dashboard created and maintained by this application, see the [dashboard_mgmt](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/dashboard_mgmt_function/) function page. 29 | 30 | ### CloudWatch Alarms 31 | 32 | At deployment time, you can optionally have this application automatically create [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) that will alert you when a monitored campaign's utilization drops below a threshold you define for two out of three evaluation periods. Since the intervals are 5 minutes, that means that two of the three 5 minute evaluations over a 15 minute span must be below the threshold to enter an alarm status. The same rule applies to transition from alarm to OK status. The alarms will be setup to alert you via email through an SNS topic. Once the alarms are setup, you can alternatively link them to any operations and messaging tools you already use (i.e. Slack, PagerDuty, etc). 33 | 34 | ![Personalize Monitor CloudWatch Alarms](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-alarms.png) 35 | 36 | For more details on the CloudWatch alarms created by this application, see the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) function page. 37 | 38 | ### CloudWatch Metrics 39 | 40 | To support the CloudWatch dashboard and alarms described above, a few new custom [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) are added for the monitored campaigns. 
These metrics are populated by the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) Lambda function that is setup to run every 5 minutes in your account. You can find these metrics in CloudWatch under Metrics in the "PersonalizeMonitor" namespace. 41 | 42 | ![Personalize Monitor CloudWatch Metrics](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-cloudwatch-metrics.png) 43 | 44 | For more details on the custom metrics created by this application, see the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) function page. 45 | 46 | ### Cost optimization (optional) 47 | 48 | This application can be optionally configured to automatically perform cost optimization actions for your Amazon Personalize campaigns. 49 | 50 | #### Idle campaigns 51 | Idle campaigns are those that have been provisioned but are not receiving any `GetRecommendations`/`GetPersonalizedRanking` calls. Since costs are incurred while a campaign is active regardless of whether it receives any requests, detecting and eliminating these idle campaigns can be an important cost optimization activity. This can be particularly useful in non-production AWS accounts such as development and testing. See the `AutoDeleteIdleCampaigns` and `IdleCampaignThresholdHours` deployment parameters in the installation instructions below and the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function#automatically-deleting-idle-campaigns-optional) function for details. 52 | 53 | #### Over-provisioned campaigns 54 | 55 | Properly provisioning campaigns, as described earlier, is also an important cost optimization activity. This application can be configured to automatically reduce a campaign's `minProvisionedTPS` based on actual request volume. 
This will optimize a campaign's utilization when request volume is lower while relying on Personalize to auto-scale based on actual activity. See the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter below and the [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function#automatically-adjusting-campaign-minprovisionedtps-optional) function for details. 56 | 57 | ### Architecture 58 | 59 | The following diagram depicts how the Lambda functions in this application work together using an event-driven approach built on [Amazon EventBridge](https://docs.aws.amazon.com/eventbridge/latest/userguide/what-is-amazon-eventbridge.html). The [personalize_monitor](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_monitor_function/) function is invoked every five minutes to generate CloudWatch metric data based on the monitored campaigns and create campaign utilization alarms (if configured). It also generates events which are published to EventBridge that trigger activities such as optimizing a campaign's `minProvisionedTPS`, deleting idle campaigns, updating the Personalize Monitor CloudWatch dashboard, and sending notifications. This approach allows you to more easily integrate these functions into your own operations by sending your own events, say, to trigger the dashboard to be rebuilt after you create a campaign or register your own targets to events generated by this application. 60 | 61 | ![Personalize Monitor Architecture](https://raw.githubusercontent.com/aws-samples/amazon-personalize-monitor/master/images/personalize-monitor-architecture.png) 62 | 63 | See the readme pages for each function for details on the events that they produce and consume. 64 | 65 | ## Installing the application 66 | 67 | ***IMPORTANT NOTE:** Deploying this application in your AWS account will create and consume AWS resources, which will cost money. 
For example, the CloudWatch dashboard, the Lambda function that collects additional monitoring metrics every 5 minutes, CloudWatch alarms, logging, and so on. Therefore, if after installing this application you choose not to use it as part of your monitoring strategy, be sure to follow the Uninstall instructions in the next section to avoid ongoing charges and to clean up all data.* 68 | 69 | | Parameter | Description | Default | 70 | | --- | --- | --- | 71 | | CampaignARNs | Comma separated list of Personalize campaign ARNs to monitor or `all` to monitor all active campaigns. It is recommended to use `all` so that any new campaigns that are added after deployment will be automatically detected, monitored, and have alarms created (optional) | `all` | 72 | | Regions | Comma separated list of AWS regions to monitor campaigns. Only applicable when `all` is used for `CampaignARNs`. Leaving this value blank will default to the region where this application is deployed (i.e. `AWS Region` parameter above). | | 73 | | AutoCreateCampaignUtilizationAlarms | Whether to automatically create a utilization CloudWatch alarm for each monitored campaign. | `Yes` | 74 | | CampaignThresholdAlarmLowerBound | Minimum threshold value (in percent) to enter alarm state for campaign utilization. This value is only relevant if `AutoCreateCampaignUtilizationAlarms` is `Yes`. | `100` | 75 | | AutoAdjustCampaignMinProvisionedTPS | Whether to automatically compare campaign request activity against the campaign's `minProvisionedTPS` to determine if `minProvisionedTPS` can be reduced to optimize utilization. | `Yes` | 76 | | AutoCreateIdleCampaignAlarms | Whether to automatically create an idle detection CloudWatch alarm for each monitored campaign. | `Yes` | 77 | | IdleCampaignThresholdHours | Number of hours that a campaign must be idle (i.e. no requests) before it is automatically deleted. `AutoDeleteIdleCampaigns` must be `Yes` for idle campaign deletion to occur.
| `24` | 78 | | AutoDeleteIdleCampaigns | Whether to automatically delete idle campaigns. An idle campaign is one that has not had any requests in `IdleCampaignThresholdHours` hours. | `No` | 79 | | NotificationEndpoint | Email address to receive alarm and ok notifications and campaign delete and update events (optional). An [SNS](https://aws.amazon.com/sns/) topic is created and this email address will be added as a subscriber to that topic. You will receive a confirmation email for the SNS topic subscription so be sure to click the confirmation link in that email to ensure you receive notifications. | | 80 | 81 | ## Uninstalling the application 82 | 83 | To remove the resources created by this application in your AWS account, be sure to uninstall the application. 84 | 85 | ## FAQs 86 | 87 | ***Q: Can I use this application to determine my accumulated inference charges during the month?*** 88 | 89 | ***A:*** No! Although the `actualTPS` and `minProvisionedTPS` custom metrics generated by this application may be used to calculate an approximation of your accumulated inference charges, it should **never** be used as a substitute or proxy for actual Personalize inference costs. Always consult your AWS Billing Dashboard for actual service charges. 90 | 91 | ***Q: What is an ideal campaign utilization percentage? Is it okay if my campaign utilization is over 100%?*** 92 | 93 | ***A:*** The campaign utilization metric is a measure of your actual campaign usage compared against the `minProvisionedTPS` for the campaign. Any utilization value >= 100% is ideal since that means you are not over-provisioning, and therefore not over-paying, for campaign resources. You're letting Personalize handle the scaling in/out of the campaign. Anytime your utilization is below 100%, more resources are provisioned than are needed to satisfy the volume of requests at that time. 
94 | 95 | ***Q: How can I tell if Personalize is scaling out fast enough?*** 96 | 97 | ***A:*** Compare the "Actual vs Provisioned TPS" graph to the "Campaign Latency" graph on the Personalize Monitor CloudWatch dashboard. When your Actual TPS increases/spikes for a campaign, does the latency for the same campaign at the same time stay consistent? If so, this tells you that Personalize is maintaining response time as request volume increases and therefore scaling fast enough to meet demand. However, if latency increases significantly and to an unacceptable level for your application, this is an indication that Personalize may not be scaling fast enough. See the answer to the following question for some options. 98 | 99 | ***Q: My workload is very spikey and Personalize is not scaling fast enough. What can I do?*** 100 | 101 | ***A:*** First, be sure to confirm that it is Personalize that is not scaling fast enough by reviewing the answer above. If the spikes are predictable or cyclical, you can pre-warm capacity in your campaign ahead of time by adjusting the `minProvisionedTPS` using the [UpdateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_UpdateCampaign.html) API and then dropping it back down after the traffic subsides. For example, increase capacity 30 minutes before a flash sale or marketing campaign is launched that brings a temporary surge in traffic. This can be done manually using the AWS console or automated by using [CloudWatch events](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/WhatIsCloudWatchEvents.html) based on a schedule or triggered based on an event in your application. The [personalize_update_campaign_tps](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/personalize_update_campaign_tps_function/) function that is deployed with this application can be used as the target for CloudWatch events or you can publish an `UpdatePersonalizeCampaignMinProvisionedTPS` event to EventBridge. 
If spikes in your workload are not predictable or known ahead of time, determining the optimal `minProvisionedTPS` to balance consistent latency vs cost is the best option. The metrics and dashboard graphs in this application can help you determine this value. 102 | 103 | ***Q: After deploying this application in my AWS account, I created some new Personalize campaigns that I also want to monitor. How can I add them to be monitored and have them appear on my dashboard? Also, what about monitored campaigns that I delete?*** 104 | 105 | ***A:*** If you specified `all` for the `CampaignARNs` deployment parameter (see installation instructions above), any new campaigns you create will be automatically monitored and alarms created (if `AutoCreateCampaignUtilizationAlarms` was set to `Yes`) when the campaigns become active. Likewise, any campaigns that are deleted will no longer be monitored. If you want this application to monitor campaigns across multiple regions, be sure to specify the region names in the `Regions` deployment parameter. Note that this only applies when `CampaignARNs` is set to `all`. The CloudWatch dashboard will be automatically rebuilt every hour to add new campaigns and drop deleted campaigns. You can also trigger the dashboard to be rebuilt by publishing a `BuildPersonalizeMonitorDashboard` event to the default EventBridge event bus (see [dashboard_mgmt_function](https://github.com/aws-samples/amazon-personalize-monitor/tree/master/src/dashboard_mgmt_function/)). 106 | 107 | ## Reporting issues 108 | 109 | If you encounter a bug, please create a new issue with as much detail as possible and steps for reproducing the bug. Similarly, if you have an idea for an improvement, please add an issue as well. Pull requests are also welcome! See the [Contributing Guidelines](https://github.com/aws-samples/amazon-personalize-monitor/blob/master/CONTRIBUTING.md) for more details.
110 | 111 | ## License summary 112 | 113 | This sample code is made available under a modified MIT license. See the LICENSE file. 114 | -------------------------------------------------------------------------------- /template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | Personalize monitoring tools including CloudWatch metrics, alarms, and dashboard; optional automated cost optimization 5 | 6 | Metadata: 7 | AWS::ServerlessRepo::Application: 8 | Name: Amazon-Personalize-Monitor 9 | Description: > 10 | Creates a CloudWatch dashboard for monitoring the utilization of Amazon Personalize 11 | campaigns, creates CloudWatch alarms based on a user-defined threshold, and 12 | includes automated cost optimization actions. 13 | Author: AWS Applied AI - Personalize 14 | SpdxLicenseId: MIT-0 15 | LicenseUrl: LICENSE 16 | ReadmeUrl: README-SAR.md 17 | Labels: ['Personalize', 'CloudWatch', 'Monitoring'] 18 | HomePageUrl: https://github.com/aws-samples/amazon-personalize-monitor 19 | SemanticVersion: 1.0.2 20 | SourceCodeUrl: https://github.com/aws-samples/amazon-personalize-monitor 21 | 22 | AWS::CloudFormation::Interface: 23 | ParameterGroups: 24 | - Label: 25 | default: "Amazon Personalize campaigns to monitor" 26 | Parameters: 27 | - CampaignARNs 28 | - Regions 29 | - Label: 30 | default: "CloudWatch alarm configuration" 31 | Parameters: 32 | - AutoCreateCampaignUtilizationAlarms 33 | - CampaignThresholdAlarmLowerBound 34 | - AutoCreateIdleCampaignAlarms 35 | - IdleCampaignThresholdHours 36 | - Label: 37 | default: "Cost optimization actions" 38 | Parameters: 39 | - AutoAdjustCampaignMinProvisionedTPS 40 | - AutoDeleteIdleCampaigns 41 | - Label: 42 | default: "Notifications" 43 | Parameters: 44 | - NotificationEndpoint 45 | ParameterLabels: 46 | CampaignARNs: 47 | default: "Personalize campaign ARNs to monitor" 48 | Regions: 49 
| default: "AWS regions to monitor" 50 | AutoCreateCampaignUtilizationAlarms: 51 | default: "Automatically create campaign utilization CloudWatch alarms?" 52 | CampaignThresholdAlarmLowerBound: 53 | default: "Campaign utilization alarm lower bound threshold" 54 | AutoCreateIdleCampaignAlarms: 55 | default: "Automatically create idle campaign CloudWatch alarms?" 56 | IdleCampaignThresholdHours: 57 | default: "Number of hours without requests to be considered idle" 58 | AutoDeleteIdleCampaigns: 59 | default: "Automatically delete idle campaigns in idle alarm state?" 60 | AutoAdjustCampaignMinProvisionedTPS: 61 | default: "Automatically adjust/lower minProvisionedTPS for campaigns in utilization alarm state" 62 | NotificationEndpoint: 63 | default: "Email address to receive notifications" 64 | 65 | Parameters: 66 | CampaignARNs: 67 | Type: String 68 | Description: > 69 | Comma separated list of Amazon Personalize Campaign ARNs to monitor or 'all' to dynamically monitor all campaigns. 70 | Default: 'all' 71 | 72 | Regions: 73 | Type: String 74 | Description: > 75 | Comma separated list of AWS region names. When using 'all' for CampaignARNs, this parameter can be used 76 | to control the region(s) where the Personalize Monitor looks for active Personalize Campaigns. When not specified, 77 | the region where you deploy this application will be used. 78 | 79 | AutoCreateCampaignUtilizationAlarms: 80 | Type: String 81 | Description: > 82 | Whether to automatically create CloudWatch alarms for campaign utilization for monitored campaigns. Valid values: Yes/No. 83 | AllowedValues: 84 | - 'Yes' 85 | - 'No' 86 | Default: 'Yes' 87 | 88 | CampaignThresholdAlarmLowerBound: 89 | Type: Number 90 | Description: > 91 | Campaign utilization alarm threshold value (in percent). When a monitored campaign's utilization falls below this value, 92 | the alarm state will be set to ALARM. Valid values: 0-1000 (integer). 
93 | MinValue: 0 94 | MaxValue: 1000 95 | Default: 100 96 | 97 | AutoAdjustCampaignMinProvisionedTPS: 98 | Type: String 99 | Description: > 100 | Whether to automatically adjust minProvisionedTPS down to lowest average TPS over rolling 24 hour window. The 101 | minProvisionedTPS will never be increased. Valid values: Yes/No. 102 | AllowedValues: 103 | - 'Yes' 104 | - 'No' 105 | Default: 'Yes' 106 | 107 | AutoCreateIdleCampaignAlarms: 108 | Type: String 109 | Description: > 110 | Whether to automatically create CloudWatch alarms for detecting idle campaigns. Valid values: Yes/No. 111 | AllowedValues: 112 | - 'Yes' 113 | - 'No' 114 | Default: 'Yes' 115 | 116 | IdleCampaignThresholdHours: 117 | Type: Number 118 | Description: > 119 | Number of consecutive idle hours before a campaign is automatically deleted only if AutoDeleteIdleCampaigns is Yes. Valid values: 2-48 (integer). 120 | MinValue: 2 121 | MaxValue: 48 122 | Default: 24 123 | 124 | AutoDeleteIdleCampaigns: 125 | Type: String 126 | Description: > 127 | Whether to automatically delete campaigns that have been idle for IdleCampaignThresholdHours consecutive hours. Valid values: Yes/No. 128 | AllowedValues: 129 | - 'Yes' 130 | - 'No' 131 | Default: 'No' 132 | 133 | NotificationEndpoint: 134 | Type: String 135 | Description: > 136 | Email address to receive CloudWatch alarm and other monitoring notifications. 
137 | 138 | Globals: 139 | Function: 140 | Timeout: 5 141 | Runtime: python3.8 142 | 143 | Resources: 144 | NotificationsTopic: 145 | Type: AWS::SNS::Topic 146 | Properties: 147 | DisplayName: 'Personalize Monitor Notifications' 148 | Subscription: 149 | - Endpoint: !Ref NotificationEndpoint 150 | Protocol: email 151 | Tags: 152 | - Key: 'CreatedBy' 153 | Value: 'PersonalizeMonitor' 154 | TopicName: PersonalizeMonitorNotifications 155 | 156 | NotificationsTopicPolicy: 157 | Type: AWS::SNS::TopicPolicy 158 | Properties: 159 | PolicyDocument: 160 | Statement: 161 | - Sid: PublishPolicy 162 | Effect: Allow 163 | Principal: 164 | Service: 165 | - cloudwatch.amazonaws.com 166 | - events.amazonaws.com 167 | Action: 'sns:Publish' 168 | Resource: !Ref NotificationsTopic 169 | Topics: 170 | - !Ref NotificationsTopic 171 | 172 | NotificationsRule: 173 | Type: AWS::Events::Rule 174 | Properties: 175 | Description: Routes Personalize Monitor notifications to notification SNS topic 176 | EventPattern: 177 | source: 178 | - personalize.monitor 179 | detail-type: 180 | - PersonalizeCampaignMinProvisionedTPSUpdated 181 | - PersonalizeCampaignDeleted 182 | State: ENABLED 183 | Targets: 184 | - Arn: !Ref NotificationsTopic 185 | Id: PersonalizeMonitorNotificationsId 186 | InputTransformer: 187 | InputPathsMap: 188 | reason: "$.detail.Reason" 189 | resources: "$.resources" 190 | type: "$.detail-type" 191 | InputTemplate: | 192 | "Amazon Personalize monitor notification:" 193 | "" 194 | "Message type: " 195 | "Resource(s): " 196 | "Reason: " 197 | 198 | CommonLayer: 199 | Type: AWS::Serverless::LayerVersion 200 | Properties: 201 | ContentUri: src/layer 202 | CompatibleRuntimes: 203 | - python3.8 204 | Metadata: 205 | BuildMethod: python3.8 206 | 207 | MonitorFunction: 208 | Type: AWS::Serverless::Function 209 | Properties: 210 | Description: Amazon Personalize monitor function that updates custom CloudWatch metrics and monitors campaign utilization every 5 minutes 211 | Timeout: 30 
212 | CodeUri: src/personalize_monitor_function 213 | Handler: personalize_monitor.lambda_handler 214 | Layers: 215 | - !Ref CommonLayer 216 | Policies: 217 | - Statement: 218 | - Sid: PersonalizePolicy 219 | Effect: Allow 220 | Action: 221 | - personalize:DescribeCampaign 222 | - personalize:DescribeSolutionVersion 223 | - personalize:ListCampaigns 224 | Resource: '*' 225 | - Sid: CloudWatchPolicy 226 | Effect: Allow 227 | Action: 228 | - cloudwatch:DescribeAlarmsForMetric 229 | - cloudwatch:DisableAlarmActions 230 | - cloudwatch:EnableAlarmActions 231 | - cloudwatch:GetMetricData 232 | - cloudwatch:PutMetricAlarm 233 | - cloudwatch:PutMetricData 234 | Resource: '*' 235 | - Sid: EventBridgePolicy 236 | Effect: Allow 237 | Action: 238 | - events:PutEvents 239 | Resource: '*' 240 | Events: 241 | ScheduledEvent: 242 | Type: Schedule 243 | Properties: 244 | Description: Triggers primary Personalize Monitor monitoring logic 245 | Schedule: cron(0/5 * * * ? *) 246 | Enabled: True 247 | Environment: 248 | Variables: 249 | CampaignARNs: !Ref CampaignARNs 250 | Regions: !Ref Regions 251 | NotificationsTopic: !Ref NotificationsTopic 252 | AutoCreateCampaignUtilizationAlarms: !Ref AutoCreateCampaignUtilizationAlarms 253 | CampaignThresholdAlarmLowerBound: !Ref CampaignThresholdAlarmLowerBound 254 | AutoCreateIdleCampaignAlarms: !Ref AutoCreateIdleCampaignAlarms 255 | IdleCampaignThresholdHours: !Ref IdleCampaignThresholdHours 256 | AutoDeleteIdleCampaigns: !Ref AutoDeleteIdleCampaigns 257 | AutoAdjustCampaignMinProvisionedTPS: !Ref AutoAdjustCampaignMinProvisionedTPS 258 | 259 | DashboardManagementFunction: 260 | Type: AWS::Serverless::Function 261 | Properties: 262 | Description: Amazon Personalize monitor function that updates the CloudWatch dashboard hourly and when campaigns are added/deleted 263 | Timeout: 15 264 | CodeUri: src/dashboard_mgmt_function 265 | Handler: dashboard_mgmt.lambda_handler 266 | AutoPublishAlias: live 267 | Layers: 268 | - !Ref CommonLayer 269 | 
Policies: 270 | - Statement: 271 | - Sid: PersonalizePolicy 272 | Effect: Allow 273 | Action: 274 | - personalize:DescribeCampaign 275 | - personalize:DescribeDatasetGroup 276 | - personalize:DescribeSolutionVersion 277 | - personalize:ListCampaigns 278 | Resource: '*' 279 | - Sid: DashboardPolicy 280 | Effect: Allow 281 | Action: 282 | - cloudwatch:DeleteDashboards 283 | - cloudwatch:PutDashboard 284 | Resource: '*' 285 | Environment: 286 | Variables: 287 | CampaignARNs: !Ref CampaignARNs 288 | Regions: !Ref Regions 289 | Events: 290 | EBRule: 291 | Type: EventBridgeRule 292 | Properties: 293 | Pattern: 294 | source: 295 | - personalize.monitor 296 | detail-type: 297 | - BuildPersonalizeMonitorDashboard 298 | ScheduledEvent: 299 | Type: Schedule 300 | Properties: 301 | Description: Hourly rebuild of Personalize Monitor CloudWatch dashboard 302 | Schedule: cron(3 * * * ? *) 303 | Enabled: True 304 | 305 | DeployDashboardCustomResource: 306 | Type: Custom::DashboardCreate 307 | Properties: 308 | ServiceToken: !GetAtt DashboardManagementFunction.Arn 309 | CampaignARNs: !Ref CampaignARNs 310 | Regions: !Ref Regions 311 | NotificationsTopic: !Ref NotificationsTopic 312 | AutoCreateCampaignUtilizationAlarms: !Ref AutoCreateCampaignUtilizationAlarms 313 | CampaignThresholdAlarmLowerBound: !Ref CampaignThresholdAlarmLowerBound 314 | AutoCreateIdleCampaignAlarms: !Ref AutoCreateIdleCampaignAlarms 315 | IdleCampaignThresholdHours: !Ref IdleCampaignThresholdHours 316 | AutoDeleteIdleCampaigns: !Ref AutoDeleteIdleCampaigns 317 | AutoAdjustCampaignMinProvisionedTPS: !Ref AutoAdjustCampaignMinProvisionedTPS 318 | 319 | UpdateCampaignTPSFunction: 320 | Type: AWS::Serverless::Function 321 | Properties: 322 | Description: Amazon Personalize monitor function that updates the minProvisionedTPS for a campaign in response to an event 323 | CodeUri: src/personalize_update_campaign_tps_function 324 | Handler: personalize_update_campaign_tps.lambda_handler 325 | Layers: 326 | - !Ref 
CommonLayer 327 | Policies: 328 | - Statement: 329 | - Sid: PersonalizePolicy 330 | Effect: Allow 331 | Action: 332 | - personalize:UpdateCampaign 333 | Resource: '*' 334 | - Sid: EventBridgePolicy 335 | Effect: Allow 336 | Action: 337 | - events:PutEvents 338 | Resource: '*' 339 | Events: 340 | EBRule: 341 | Type: EventBridgeRule 342 | Properties: 343 | Pattern: 344 | source: 345 | - personalize.monitor 346 | detail-type: 347 | - UpdatePersonalizeCampaignMinProvisionedTPS 348 | 349 | DeleteCampaignFunction: 350 | Type: AWS::Serverless::Function 351 | Properties: 352 | Description: Amazon Personalize monitor function that deletes a campaign in response to an event 353 | CodeUri: src/personalize_delete_campaign_function 354 | Handler: personalize_delete_campaign.lambda_handler 355 | Layers: 356 | - !Ref CommonLayer 357 | Policies: 358 | - Statement: 359 | - Sid: PersonalizePolicy 360 | Effect: Allow 361 | Action: 362 | - personalize:DeleteCampaign 363 | Resource: '*' 364 | - Sid: EventBridgePolicy 365 | Effect: Allow 366 | Action: 367 | - events:PutEvents 368 | Resource: '*' 369 | - Sid: CloudWatchFindAlarmsPolicy 370 | Effect: Allow 371 | Action: 372 | - cloudwatch:DescribeAlarms 373 | - cloudwatch:ListTagsForResource 374 | Resource: '*' 375 | - Sid: CloudWatchDeletePolicy 376 | Effect: Allow 377 | Action: 378 | - cloudwatch:DeleteAlarms 379 | Resource: !Sub 'arn:aws:cloudwatch:*:${AWS::AccountId}:alarm:PersonalizeMonitor-*' 380 | Events: 381 | EBCustomRule: 382 | Type: EventBridgeRule 383 | Properties: 384 | Pattern: 385 | source: 386 | - personalize.monitor 387 | detail-type: 388 | - DeletePersonalizeCampaign 389 | 390 | CleanupFunction: 391 | Type: AWS::Serverless::Function 392 | Properties: 393 | Description: Amazon Personalize monitor custom resource function that cleans up directly created resources when the application is deleted 394 | Timeout: 15 395 | CodeUri: src/cleanup_resources_function 396 | Handler: cleanup_resources.lambda_handler 397 | 
AutoPublishAlias: live 398 | Layers: 399 | - !Ref CommonLayer 400 | Policies: 401 | - Statement: 402 | - Sid: PersonalizePolicy 403 | Effect: Allow 404 | Action: 405 | - personalize:ListCampaigns 406 | Resource: '*' 407 | - Sid: CloudWatchFindAlarmsPolicy 408 | Effect: Allow 409 | Action: 410 | - cloudwatch:DescribeAlarms 411 | - cloudwatch:ListTagsForResource 412 | Resource: '*' 413 | - Sid: CloudWatchDeletePolicy 414 | Effect: Allow 415 | Action: 416 | - cloudwatch:DeleteAlarms 417 | Resource: !Sub 'arn:aws:cloudwatch:*:${AWS::AccountId}:alarm:PersonalizeMonitor-*' 418 | Environment: 419 | Variables: 420 | CampaignARNs: !Ref CampaignARNs 421 | Regions: !Ref Regions 422 | 423 | CleanupCustomResource: 424 | Type: Custom::Cleanup 425 | Properties: 426 | ServiceToken: !GetAtt CleanupFunction.Arn 427 | CampaignARNs: !Ref CampaignARNs 428 | Regions: !Ref Regions 429 | NotificationsTopic: !Ref NotificationsTopic 430 | AutoCreateCampaignUtilizationAlarms: !Ref AutoCreateCampaignUtilizationAlarms 431 | CampaignThresholdAlarmLowerBound: !Ref CampaignThresholdAlarmLowerBound 432 | AutoCreateIdleCampaignAlarms: !Ref AutoCreateIdleCampaignAlarms 433 | IdleCampaignThresholdHours: !Ref IdleCampaignThresholdHours 434 | AutoDeleteIdleCampaigns: !Ref AutoDeleteIdleCampaigns 435 | AutoAdjustCampaignMinProvisionedTPS: !Ref AutoAdjustCampaignMinProvisionedTPS 436 | 437 | Outputs: 438 | MonitorFunction: 439 | Description: "Personalize monitor Function ARN" 440 | Value: !GetAtt MonitorFunction.Arn 441 | 442 | DashboardManagementFunction: 443 | Description: "CloudWatch Dashboard Management Function ARN" 444 | Value: !GetAtt DashboardManagementFunction.Arn 445 | 446 | UpdateCampaignTPSFunction: 447 | Description: "Update Personalize Campaign minProvisionedTPS Function ARN" 448 | Value: !GetAtt UpdateCampaignTPSFunction.Arn 449 | 450 | DeleteCampaignFunction: 451 | Description: "Delete Personalize Campaign Function ARN" 452 | Value: !GetAtt DeleteCampaignFunction.Arn 453 | 454 | 
NotificationsTopic: 455 | Description: "Notification SNS Topic ARN" 456 | Value: !Ref NotificationsTopic 457 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon Personalize Monitor 2 | 3 | 4 | * [Why is this important?](#Whyisthisimportant) 5 | * [Features](#Features) 6 | * [CloudWatch dashboard](#CloudWatchdashboard) 7 | * [CloudWatch alarms](#CloudWatchalarms) 8 | * [CloudWatch metrics](#CloudWatchmetrics) 9 | * [Cost optimization (optional)](#Costoptimizationoptional) 10 | * [Idle campaigns](#Idlecampaigns) 11 | * [Over-provisioned campaigns](#Over-provisionedcampaigns) 12 | * [Architecture](#Architecture) 13 | * [Installing the application](#Installingtheapplication) 14 | * [Option 1 - Install from Serverless Application Repository](#Option1-InstallfromServerlessApplicationRepository) 15 | * [Option 2 - Install using Serverless Application Model](#Option2-InstallusingServerlessApplicationModel) 16 | * [Application settings/parameters](#Applicationsettingsparameters) 17 | * [Uninstalling the application](#Uninstallingtheapplication) 18 | * [FAQs](#FAQs) 19 | * [Reporting issues](#Reportingissues) 20 | * [License summary](#Licensesummary) 21 | 22 | 26 | 27 | 28 | This project contains the source code and supporting files for deploying a serverless application that adds monitoring, alerting, and optimization capabilities for [Amazon Personalize](https://aws.amazon.com/personalize/), an AI service from AWS that allows you to create custom ML recommenders based on your data. Highlights include: 29 | 30 | - Generation of additional [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) to track the Average TPS, `minProvisionedTPS`, and Utilization of Personalize [campaigns](https://docs.aws.amazon.com/personalize/latest/dg/campaigns.html) over time.
31 | - [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) to alert you via SNS/email when campaign utilization drops below a configurable threshold or has been idle for a configurable length of time (optional). 32 | - [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) populated with graph widgets for Actual vs Provisioned TPS, Campaign Utilization, Campaign Latency, and the number of campaigns being monitored. 33 | - Capable of monitoring campaigns across multiple regions in the same AWS account. 34 | - Automatically delete campaigns that have been idle more than a configurable number of hours (optional). 35 | - Automatically reduce the `minProvisionedTPS` for over-provisioned campaigns to optimize cost (optional). 36 | 37 | ## Why is this important? 38 | 39 | Once you create a solution and solution version based on your data, an Amazon Personalize campaign can be created that allows you to retrieve recommendations in real-time based on the solution version. This is typically how Personalize is integrated into your applications. When an application needs to display personalized recommendations to a user, a [GetRecommendations](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#recommendations) or [GetPersonalizedRanking](https://docs.aws.amazon.com/personalize/latest/dg/getting-real-time-recommendations.html#rankings) API call is made to a campaign to retrieve recommendations. Just like monitoring your own application components is important, monitoring your Personalize campaigns is also important and considered a best practice. This application is designed to help you do just that. 40 | 41 | When you provision a campaign using the [CreateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_CreateCampaign.html) API, you must specify a value for `minProvisionedTPS`. 
This value specifies the requested _minimum_ provisioned transactions (calls) per second that Amazon Personalize will support for that campaign. As your actual request volume to a campaign approaches its `minProvisionedTPS`, Personalize will automatically provision additional resources to meet your request volume. Then when request volume drops, Personalize will automatically scale back down **no lower** than `minProvisionedTPS`. **Since you are billed based on the higher of actual TPS and `minProvisionedTPS`, it is therefore important to not over-provision your campaigns to optimize cost.** This also means that leaving a campaign idle (active but no longer in-use) will result in unnecessary charges. This application gives you the tools to visualize your campaign utilization, to be notified when there is an opportunity to tune your campaign provisioning, and even take action to reduce and eliminate over-provisioning. 42 | 43 | > General best practice is to set `minProvisionedTPS` to `1`, or your low watermark for campaign recommendations requests, and let Personalize auto-scale campaign resources to meet actual demand. 44 | 45 | See the Amazon Personalize [pricing page](https://aws.amazon.com/personalize/pricing/) for full details on costs. 46 | 47 | ## Features 48 | 49 | ### CloudWatch dashboard 50 | 51 | When you deploy this application, a [CloudWatch dashboard](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Dashboards.html) is built with widgets for Actual vs Provisioned TPS, Campaign Utilization, and Campaign Latency for the campaigns you wish to monitor. The dashboard gives you critical visual information to assess how your campaigns are performing and being utilized. The data in these graphs can help you properly tune your campaign's `minProvisionedTPS`. 
52 | 53 | ![Personalize Monitor CloudWatch Dashboard](./images/personalize-monitor-cloudwatch-dashboard.png) 54 | 55 | For more details on the CloudWatch dashboard created and maintained by this application, see the [dashboard_mgmt](./src/dashboard_mgmt_function/) function page. 56 | 57 | ### CloudWatch alarms 58 | 59 | At deployment time, you can optionally have this application automatically create [CloudWatch alarms](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html) that will alert you when a monitored campaign's utilization drops below a threshold you define for nine out of twelve evaluation periods. Since the intervals are 5 minutes, that means that nine of the 5-minute evaluations over a 1-hour span must be below the threshold to enter an alarm status. The same rule applies to transition from alarm to OK status. Similarly, the idle campaign alarm will alert you when there has been no request activity for a campaign for a configurable amount of time. The alarms will be set up to alert you via email through an SNS topic. Once the alarms are set up, you can alternatively link them to any operations and messaging tools you already use (e.g., Slack, PagerDuty, etc.). 60 | 61 | ![Personalize Monitor CloudWatch Alarms](./images/personalize-monitor-cloudwatch-alarms.png) 62 | 63 | For more details on the CloudWatch alarms created by this application, see the [personalize_monitor](./src/personalize_monitor_function/) function page. 64 | 65 | ### CloudWatch metrics 66 | 67 | To support the CloudWatch dashboard and alarms described above, a few new custom [CloudWatch metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/working_with_metrics.html) are added for the monitored campaigns. These metrics are populated by the [personalize_monitor](./src/personalize_monitor_function/) Lambda function that is set up to run every 5 minutes in your account.
You can find these metrics in CloudWatch under Metrics in the "PersonalizeMonitor" namespace. 68 | 69 | ![Personalize Monitor CloudWatch Metrics](./images/personalize-monitor-cloudwatch-metrics.png) 70 | 71 | For more details on the custom metrics created by this application, see the [personalize_monitor](./src/personalize_monitor_function/) function page. 72 | 73 | ### Cost optimization (optional) 74 | 75 | This application can be optionally configured to automatically perform cost optimization actions for your Amazon Personalize campaigns. 76 | 77 | #### Idle campaigns 78 | Idle campaigns are those that have been provisioned but are not receiving any `GetRecommendations`/`GetPersonalizedRanking` calls. Since costs are incurred while a campaign is active regardless of whether it receives any requests, detecting and eliminating these idle campaigns can be an important cost optimization activity. This can be particularly useful in non-production AWS accounts such as development and testing where you are more likely to have abandoned campaigns. See the `AutoDeleteIdleCampaigns` and `IdleCampaignThresholdHours` deployment parameters in the installation instructions below and the [personalize_monitor](./src/personalize_monitor_function#automatically-deleting-idle-campaigns-optional) function for details. 79 | 80 | #### Over-provisioned campaigns 81 | 82 | Properly provisioning campaigns, as described earlier, is also an important cost optimization activity. This application can be configured to automatically reduce a campaign's `minProvisionedTPS` based on actual request volume. This will optimize a campaign's utilization when request volume is lower while relying on Personalize to auto-scale based on actual activity. See the `AutoAdjustCampaignMinProvisionedTPS` deployment parameter below and the [personalize_monitor](./src/personalize_monitor_function#automatically-adjusting-campaign-minprovisionedtps-optional) function for details. 
83 | 84 | ## Architecture 85 | 86 | The following diagram depicts how the Lambda functions in this application work together using an event-driven approach built on [Amazon EventBridge](https://docs.aws.amazon.com/eventbridge/latest/userguide/what-is-amazon-eventbridge.html). The [personalize_monitor](./src/personalize_monitor_function/) function is invoked every five minutes to generate CloudWatch metric data based on the monitored campaigns and create alarms (if configured). It also generates events which are published to EventBridge that trigger activities such as optimizing a campaign's `minProvisionedTPS`, deleting idle campaigns, updating the Personalize Monitor CloudWatch dashboard, and sending notifications. This approach allows you to more easily integrate these functions into your own operations by sending your own events (say, to trigger the dashboard to be rebuilt after you create a campaign) or by registering your own targets for events generated by this application. 87 | 88 | ![Personalize Monitor Architecture](./images/personalize-monitor-architecture.png) 89 | 90 | See the readme pages for each function for details on the events that they produce and consume. 91 | 92 | ## Installing the application 93 | 94 | ***IMPORTANT NOTE:** Deploying this application in your AWS account will create and consume AWS resources, which will cost money. For example, charges apply to the CloudWatch dashboard, the Lambda function that runs every 5 minutes to collect additional monitoring metrics, CloudWatch alarms, logging, and so on. Therefore, if after installing this application you choose not to use it as part of your monitoring strategy, be sure to follow the Uninstall instructions below to clean up all resources and avoid ongoing charges.* 95 | 96 | ### Option 1 - Install from Serverless Application Repository 97 | 98 | The easiest way to deploy this application is from the [Serverless Application Repository](https://aws.amazon.com/serverless/serverlessrepo/) (SAR). 99 | 100 | 1.
Within the AWS account where you wish to deploy the application, browse to the [application's page](https://serverlessrepo.aws.amazon.com/applications/arn:aws:serverlessrepo:us-east-1:316031960777:applications~Amazon-Personalize-Monitor) in the Serverless Application Repository and click **"Deploy"**. 101 | 2. Enter/update values in the **"Application settings"** panel (described below) and click **"Deploy"** again. 102 | 103 | ### Option 2 - Install using Serverless Application Model 104 | 105 | If you'd rather install the application manually, you can use the AWS [Serverless Application Model](https://aws.amazon.com/serverless/sam/) (SAM) CLI to build and deploy the application into your AWS account. 106 | 107 | To use the SAM CLI, you need the following tools. 108 | 109 | * SAM CLI - [Install the SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html) 110 | * [Python 3 installed](https://www.python.org/downloads/) 111 | * Docker - [Install Docker community edition](https://hub.docker.com/search/?type=edition&offering=community) 112 | 113 | To build and deploy the application for the first time, run the following in your shell: 114 | 115 | ```bash 116 | sam build --use-container --cached 117 | sam deploy --guided 118 | ``` 119 | 120 | The first command will build the source of the application. The second command will package and deploy the application to your AWS account with a series of prompts. The following section describes the supported application parameters. 121 | 122 | ### Application settings/parameters 123 | 124 | Whether you install this application from SAR or SAM, the following parameters can be used to control how the application monitors your Personalize deployments. 125 | 126 | | Parameter | Description | Default | 127 | | --- | --- | --- | 128 | | CampaignARNs | Comma separated list of Personalize campaign ARNs to monitor or `all` to monitor all active campaigns. 
It is recommended to use `all` so that any new campaigns that are added after deployment will be automatically detected, monitored, and have alarms created (optional). | `all` | 129 | | Regions | Comma-separated list of AWS regions to monitor campaigns. Only applicable when `all` is used for `CampaignARNs`. Leaving this value blank will default to the region where this application is deployed (i.e. `AWS Region` parameter above). | | 130 | | AutoCreateCampaignUtilizationAlarms | Whether to automatically create a utilization CloudWatch alarm for each monitored campaign. | `Yes` | 131 | | CampaignThresholdAlarmLowerBound | Minimum threshold value (in percent) to enter alarm state for campaign utilization. This value is only relevant if `AutoCreateCampaignUtilizationAlarms` is `Yes`. | `100` | 132 | | AutoAdjustCampaignMinProvisionedTPS | Whether to automatically compare campaign request activity against the campaign's `minProvisionedTPS` to determine if `minProvisionedTPS` can be reduced to optimize utilization. | `Yes` | 133 | | AutoCreateIdleCampaignAlarms | Whether to automatically create an idle detection CloudWatch alarm for each monitored campaign. | `Yes` | 134 | | IdleCampaignThresholdHours | Number of hours that a campaign must be idle (i.e. no requests) before it is automatically deleted. `AutoDeleteIdleCampaigns` must be `Yes` for idle campaign deletion to occur. | `24` | 135 | | AutoDeleteIdleCampaigns | Whether to automatically delete idle campaigns. An idle campaign is one that has not had any requests in `IdleCampaignThresholdHours` hours. | `No` | 136 | | NotificationEndpoint | Email address to receive alarm and OK notifications and campaign delete and update events (optional). An [SNS](https://aws.amazon.com/sns/) topic is created and this email address will be added as a subscriber to that topic. You will receive a confirmation email for the SNS topic subscription so be sure to click the confirmation link in that email to ensure you receive notifications.
| | 137 | 138 | ## Uninstalling the application 139 | 140 | If you installed the application from the Serverless Application Repository, you can delete the application from the Lambda console in your AWS account (under Applications). 141 | 142 | Alternatively, if you installed the application using SAM, you can delete the application using the AWS CLI. Assuming you used the default application name for the stack name (`personalize-monitor`), you can run the following: 143 | 144 | ```bash 145 | aws cloudformation delete-stack --stack-name personalize-monitor 146 | ``` 147 | 148 | You can also delete the application stack in CloudFormation in the AWS console. 149 | 150 | ## FAQs 151 | 152 | ***Q: Can I use this application to determine my accumulated inference charges during the month?*** 153 | 154 | ***A:*** No! Although the `actualTPS` and `minProvisionedTPS` custom metrics generated by this application may be used to calculate an approximation of your accumulated inference charges, it should **never** be used as a substitute or proxy for actual Personalize inference costs. Always consult your AWS Billing Dashboard for actual service charges. 155 | 156 | ***Q: What is an ideal campaign utilization percentage? Is it okay if my campaign utilization is over 100%?*** 157 | 158 | ***A:*** The campaign utilization metric is a measure of your actual campaign usage compared against the `minProvisionedTPS` for the campaign. Any utilization value >= 100% is ideal since that means you are not over-provisioning, and therefore not over-paying, for campaign resources. You're letting Personalize handle the scaling in/out of the campaign. Anytime your utilization is below 100%, more resources are provisioned than are needed to satisfy the volume of requests at that time. 
159 | 160 | ***Q: How can I tell if Personalize is scaling out fast enough?*** 161 | 162 | ***A:*** Compare the "Actual vs Provisioned TPS" graph to the "Campaign Latency" graph on the Personalize Monitor CloudWatch dashboard. When your Actual TPS increases/spikes for a campaign, does the latency for the same campaign at the same time stay consistent? If so, this tells you that Personalize is maintaining response time as request volume increases and therefore scaling fast enough to meet demand. However, if latency increases significantly and to an unacceptable level for your application, this is an indication that Personalize may not be scaling fast enough. See the answer to the following question for some options. 163 | 164 | ***Q: My workload is very spikey and Personalize is not scaling fast enough. What can I do?*** 165 | 166 | ***A:*** First, be sure to confirm that it is Personalize that is not scaling fast enough by reviewing the answer above. If the spikes are predictable or cyclical, you can pre-warm capacity in your campaign ahead of time by adjusting the `minProvisionedTPS` using the [UpdateCampaign](https://docs.aws.amazon.com/personalize/latest/dg/API_UpdateCampaign.html) API and then dropping it back down after the traffic subsides. For example, increase capacity 30 minutes before a flash sale or marketing campaign is launched that brings a temporary surge in traffic. This can be done manually using the AWS console or automated by using [CloudWatch events](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/WhatIsCloudWatchEvents.html) based on a schedule or triggered based on an event in your application. The [personalize_update_campaign_tps](./src/personalize_update_campaign_tps_function/) function that is deployed with this application can be used as the target for CloudWatch events or you can publish an `UpdatePersonalizeCampaignMinProvisionedTPS` event to EventBridge. 
If spikes in your workload are not predictable or known ahead of time, determining the optimal `minProvisionedTPS` to balance consistent latency vs cost is the best option. The metrics and dashboard graphs in this application can help you determine this value. 167 | 168 | ***Q: After deploying this application in my AWS account, I created some new Personalize campaigns that I also want to monitor. How can I add them to be monitored and have them appear on my dashboard? Also, what about monitored campaigns that I delete?*** 169 | 170 | ***A:*** If you specified `all` for the `CampaignARNs` deployment parameter (see installation instructions above), any new campaigns you create will be automatically monitored and alarms created (if `AutoCreateCampaignUtilizationAlarms` and/or `AutoCreateIdleCampaignAlarms` were set to `Yes`) when the campaigns become active. Likewise, any campaigns that are deleted will no longer be monitored. If you want this application to monitor campaigns across multiple regions, be sure to specify the region names in the `Regions` deployment parameter. Note that this only applies when `CampaignARNs` is set to `all`. The CloudWatch dashboard will be automatically rebuilt every hour to add new campaigns and drop deleted campaigns. You can also trigger the dashboard to be rebuilt by publishing a `BuildPersonalizeMonitorDashboard` event to the default EventBridge event bus (see [dashboard_mgmt_function](./src/dashboard_mgmt_function/)). 171 | 172 | If you want to change your deployment parameters that control what campaigns are monitored, redeploy the application using the installation option selected above. 173 | 174 | **IMPORTANT: Redeploying this application will fully rebuild and replace your Personalize Monitor dashboard so any changes you made manually to the dashboard will be lost.** 175 | 176 | ## Reporting issues 177 | 178 | If you encounter a bug, please create a new issue with as much detail as possible and steps for reproducing the bug.
Similarly, if you have an idea for an improvement, please add an issue as well. Pull requests are also welcome! See the [Contributing Guidelines](./CONTRIBUTING.md) for more details. 179 | 180 | ## License summary 181 | 182 | This sample code is made available under a modified MIT license. See the LICENSE file. 183 | -------------------------------------------------------------------------------- /src/personalize_monitor_function/personalize_monitor.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | """Lambda function that records Personalize resource metrics 5 | 6 | Lambda function designed to be called every five minutes to record campaign TPS 7 | utilization metrics in CloudWatch. The metrics are used for alarms and on the 8 | CloudWatch dashboard created by this application. 9 | """ 10 | 11 | import json 12 | import boto3 13 | import os 14 | import datetime 15 | import sys 16 | import math 17 | 18 | from botocore.exceptions import ClientError 19 | from aws_lambda_powertools import Logger 20 | 21 | from common import ( 22 | PROJECT_NAME, 23 | ALARM_NAME_PREFIX, 24 | extract_region, 25 | get_client, 26 | determine_campaign_arns, 27 | get_configured_active_campaigns, 28 | put_event 29 | ) 30 | 31 | logger = Logger() 32 | 33 | MAX_METRICS_PER_CALL = 20 34 | MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS = 1 35 | 36 | ALARM_PERIOD_SECONDS = 300 37 | ALARM_NAME_PREFIX_LOW_UTILIZATION = ALARM_NAME_PREFIX + 'LowCampaignUtilization-' 38 | ALARM_NAME_PREFIX_IDLE = ALARM_NAME_PREFIX + 'IdleCampaign-' 39 | 40 | def get_campaign_recipe_arn(campaign): 41 | recipe_arn = campaign.get('recipeArn') 42 | if not recipe_arn: 43 | campaign_region = extract_region(campaign['campaignArn']) 44 | personalize = get_client('personalize', campaign_region) 45 | 46 | response = personalize.describe_solution_version(solutionVersionArn = 
campaign['solutionVersionArn']) 47 | 48 | recipe_arn = response['solutionVersion']['recipeArn'] 49 | campaign['recipeArn'] = recipe_arn 50 | 51 | return recipe_arn 52 | 53 | def get_campaign_inference_metric_name(campaign): 54 | metric_name = 'GetRecommendations' 55 | if get_campaign_recipe_arn(campaign) == 'arn:aws:personalize:::recipe/aws-personalized-ranking': 56 | metric_name = 'GetPersonalizedRanking' 57 | 58 | return metric_name 59 | 60 | def get_campaign_sum_requests_datapoints(campaign, start_time, end_time, period): 61 | campaign_region = extract_region(campaign['campaignArn']) 62 | cw = get_client(service_name = 'cloudwatch', region_name = campaign_region) 63 | 64 | metric_name = get_campaign_inference_metric_name(campaign) 65 | 66 | response = cw.get_metric_data( 67 | MetricDataQueries = [ 68 | { 69 | 'Id': 'm1', 70 | 'MetricStat': { 71 | 'Metric': { 72 | 'Namespace': 'AWS/Personalize', 73 | 'MetricName': metric_name, 74 | 'Dimensions': [ 75 | { 76 | 'Name': 'CampaignArn', 77 | 'Value': campaign['campaignArn'] 78 | } 79 | ] 80 | }, 81 | 'Period': period, 82 | 'Stat': 'Sum' 83 | }, 84 | 'ReturnData': True 85 | } 86 | ], 87 | StartTime = start_time, 88 | EndTime = end_time, 89 | ScanBy = 'TimestampDescending' 90 | ) 91 | 92 | datapoints = [] 93 | 94 | if response.get('MetricDataResults') and len(response['MetricDataResults']) > 0: 95 | results = response['MetricDataResults'][0] 96 | 97 | for idx, ts in enumerate(results['Timestamps']): 98 | datapoints.append({ 99 | 'Timestamp': ts, 100 | 'Value': results['Values'][idx] 101 | }) 102 | 103 | return datapoints 104 | 105 | def get_campaign_sum_requests_by_hour(campaign, start_time, end_time): 106 | datapoints = get_campaign_sum_requests_datapoints(campaign, start_time, end_time, 3600) 107 | return datapoints 108 | 109 | def get_campaign_total_requests(campaign, start_time, end_time, period): 110 | datapoints = get_campaign_sum_requests_datapoints(campaign, start_time, end_time, period) 111 | 112 | sum_requests 
= 0 113 | if datapoints: 114 | for datapoint in datapoints: 115 | sum_requests += datapoint['Value'] 116 | 117 | return sum_requests 118 | 119 | def get_campaign_average_tps(campaign, start_time, end_time, period = ALARM_PERIOD_SECONDS): 120 | sum_requests = get_campaign_total_requests(campaign, start_time, end_time, period) 121 | return sum_requests / period 122 | 123 | def get_campaign_age_hours(campaign): 124 | diff = datetime.datetime.now(datetime.timezone.utc) - campaign['creationDateTime'] 125 | days, seconds = diff.days, diff.seconds 126 | 127 | hours_age = days * 24 + seconds // 3600 128 | return hours_age 129 | 130 | def get_campaign_last_update_age_hours(campaign): 131 | hours_age = None 132 | if campaign.get('lastUpdatedDateTime'): 133 | diff = datetime.datetime.now(datetime.timezone.utc) - campaign['lastUpdatedDateTime'] 134 | days, seconds = diff.days, diff.seconds 135 | 136 | hours_age = days * 24 + seconds // 3600 137 | return hours_age 138 | 139 | def is_campaign_updatable(campaign): 140 | status = campaign['status'] 141 | updatable = status == 'ACTIVE' or status == 'CREATE FAILED' 142 | 143 | if updatable and campaign.get('latestCampaignUpdate'): 144 | status = campaign['latestCampaignUpdate']['status'] 145 | updatable = status == 'ACTIVE' or status == 'CREATE FAILED' 146 | 147 | return updatable 148 | 149 | def put_metrics(client, metric_datas): 150 | metric = { 151 | 'Namespace': PROJECT_NAME, 152 | 'MetricData': metric_datas 153 | } 154 | 155 | client.put_metric_data(**metric) 156 | logger.debug('Put data for %d metrics', len(metric_datas)) 157 | 158 | def append_metric(metric_datas_by_region, region, metric): 159 | metric_datas = metric_datas_by_region.get(region) 160 | 161 | if not metric_datas: 162 | metric_datas = [] 163 | metric_datas_by_region[region] = metric_datas 164 | 165 | metric_datas.append(metric) 166 | 167 | def create_utilization_alarm(campaign_region, campaign, utilization_threshold_lower_bound): 168 | cw = 
get_client(service_name = 'cloudwatch', region_name = campaign_region) 169 | 170 | response = cw.describe_alarms_for_metric( 171 | MetricName = 'campaignUtilization', 172 | Namespace = PROJECT_NAME, 173 | Dimensions=[ 174 | { 175 | 'Name': 'CampaignArn', 176 | 'Value': campaign['campaignArn'] 177 | }, 178 | ] 179 | ) 180 | 181 | alarm_name = ALARM_NAME_PREFIX_LOW_UTILIZATION + campaign['name'] 182 | 183 | low_utilization_alarm_exists = False 184 | # Only enable alarm actions when minTPS > 1 since we can't really do 185 | # anything to impact utilization by dropping minTPS. Let the idle 186 | # campaign alarm handle abandoned campaigns. 187 | enable_actions = campaign['minProvisionedTPS'] > 1 188 | actions_currently_enabled = False 189 | 190 | for alarm in response['MetricAlarms']: 191 | if (alarm['AlarmName'].startswith(ALARM_NAME_PREFIX_LOW_UTILIZATION) and 192 | alarm['ComparisonOperator'] in [ 'LessThanThreshold', 'LessThanOrEqualToThreshold' ]): 193 | alarm_name = alarm['AlarmName'] 194 | low_utilization_alarm_exists = True 195 | actions_currently_enabled = alarm['ActionsEnabled'] 196 | break 197 | 198 | alarm_created = False 199 | 200 | if not low_utilization_alarm_exists: 201 | logger.info('Creating lower bound utilization alarm for %s', campaign['campaignArn']) 202 | 203 | topic_arn = os.environ['NotificationsTopic'] 204 | 205 | cw.put_metric_alarm( 206 | AlarmName = alarm_name, 207 | AlarmDescription = 'Alarms when campaign utilization falls below threashold indicating possible over provisioning condition', 208 | ActionsEnabled = enable_actions, 209 | OKActions = [ topic_arn ], 210 | AlarmActions = [ topic_arn ], 211 | MetricName = 'campaignUtilization', 212 | Namespace = PROJECT_NAME, 213 | Statistic = 'Average', 214 | Dimensions = [ 215 | { 216 | 'Name': 'CampaignArn', 217 | 'Value': campaign['campaignArn'] 218 | } 219 | ], 220 | Period = ALARM_PERIOD_SECONDS, 221 | EvaluationPeriods = 12, # last 60 minutes 222 | DatapointsToAlarm = 9, # alarm state for 
45 of last 60 minutes 223 | Threshold = utilization_threshold_lower_bound, 224 | ComparisonOperator = 'LessThanThreshold', 225 | TreatMissingData = 'missing', 226 | Tags=[ 227 | { 228 | 'Key': 'CreatedBy', 229 | 'Value': PROJECT_NAME 230 | } 231 | ] 232 | ) 233 | 234 | alarm_created = True 235 | elif enable_actions != actions_currently_enabled: 236 | # Toggle enable/disable actions for existing alarm. 237 | if enable_actions: 238 | cw.enable_alarm_actions(AlarmNames = [ alarm_name ]) 239 | else: 240 | cw.disable_alarm_actions(AlarmNames = [ alarm_name ]) 241 | 242 | return alarm_created 243 | 244 | def create_idle_campaign_alarm(campaign_region, campaign, idle_campaign_threshold_hours): 245 | cw = get_client(service_name = 'cloudwatch', region_name = campaign_region) 246 | topic_arn = os.environ['NotificationsTopic'] 247 | 248 | metric_name = get_campaign_inference_metric_name(campaign) 249 | 250 | response = cw.describe_alarms_for_metric( 251 | MetricName = metric_name, 252 | Namespace = 'AWS/Personalize', 253 | Dimensions=[ 254 | { 255 | 'Name': 'CampaignArn', 256 | 'Value': campaign['campaignArn'] 257 | }, 258 | ] 259 | ) 260 | 261 | alarm_name = ALARM_NAME_PREFIX_IDLE + campaign['name'] 262 | 263 | idle_alarm_exists = False 264 | # Only enable actions when the campaign has existed at least as long as 265 | # the idle threshold. This is necessary since the alarm treats missing 266 | # data as breaching. 
267 | enable_actions = get_campaign_age_hours(campaign) >= idle_campaign_threshold_hours 268 | actions_currently_enabled = False 269 | 270 | for alarm in response['MetricAlarms']: 271 | if (alarm['AlarmName'].startswith(ALARM_NAME_PREFIX_IDLE) and 272 | alarm['ComparisonOperator'] == 'LessThanOrEqualToThreshold' and 273 | int(alarm['Threshold']) == 0): 274 | alarm_name = alarm['AlarmName'] 275 | idle_alarm_exists = True 276 | actions_currently_enabled = alarm['ActionsEnabled'] 277 | break 278 | 279 | alarm_created = False 280 | 281 | if not idle_alarm_exists: 282 | logger.info('Creating idle utilization alarm for %s', campaign['campaignArn']) 283 | 284 | cw.put_metric_alarm( 285 | AlarmName = alarm_name, 286 | AlarmDescription = 'Alarms when campaign utilization is idle for continguous length of time indicating potential abandoned campaign', 287 | ActionsEnabled = enable_actions, 288 | OKActions = [ topic_arn ], 289 | AlarmActions = [ topic_arn ], 290 | MetricName = metric_name, 291 | Namespace = 'AWS/Personalize', 292 | Statistic = 'Sum', 293 | Dimensions = [ 294 | { 295 | 'Name': 'CampaignArn', 296 | 'Value': campaign['campaignArn'] 297 | } 298 | ], 299 | Period = ALARM_PERIOD_SECONDS, 300 | EvaluationPeriods = int(((60 * 60) / ALARM_PERIOD_SECONDS) * idle_campaign_threshold_hours), 301 | Threshold = 0, 302 | ComparisonOperator = 'LessThanOrEqualToThreshold', 303 | TreatMissingData = 'breaching', # Won't get metric data for idle campaigns 304 | Tags=[ 305 | { 306 | 'Key': 'CreatedBy', 307 | 'Value': PROJECT_NAME 308 | } 309 | ] 310 | ) 311 | 312 | alarm_created = True 313 | elif enable_actions != actions_currently_enabled: 314 | # Toggle enable/disable actions for existing alarm. 
315 | if enable_actions: 316 | cw.enable_alarm_actions(AlarmNames = [ alarm_name ]) 317 | else: 318 | cw.disable_alarm_actions(AlarmNames = [ alarm_name ]) 319 | 320 | return alarm_created 321 | 322 | def divide_chunks(l, n): 323 | for i in range(0, len(l), n): 324 | yield l[i:i + n] 325 | 326 | def perform_hourly_checks(campaign_arn): 327 | ''' Hashes campaign_arn across 10 minute intervals of the current hour so we spread out campaign hourly checks ''' 328 | num_slots = 6 # 60 mins / 10 329 | slot = sum(bytearray(campaign_arn.encode('utf-8'))) % num_slots 330 | # Allow for match on first two minutes of 10 minute slot to account for CW event lag (assumes current schedule of every 5 mins). 331 | return datetime.datetime.now().minute in range(slot * 10, slot * 10 + 2) 332 | 333 | @logger.inject_lambda_context(log_event=True) 334 | def lambda_handler(event, context): 335 | auto_create_utilization_alarms = event.get('AutoCreateCampaignUtilizationAlarms') 336 | if not auto_create_utilization_alarms: 337 | auto_create_utilization_alarms = os.environ.get('AutoCreateCampaignUtilizationAlarms', 'yes').lower() in [ 'true', 'yes', '1' ] 338 | 339 | utilization_threshold_lower_bound = event.get('CampaignThresholdAlarmLowerBound') 340 | if not utilization_threshold_lower_bound: 341 | utilization_threshold_lower_bound = float(os.environ.get('CampaignThresholdAlarmLowerBound', '100.0')) 342 | 343 | auto_create_idle_alarms = event.get('AutoCreateIdleCampaignAlarms') 344 | if not auto_create_idle_alarms: 345 | auto_create_idle_alarms = os.environ.get('AutoCreateIdleCampaignAlarms', 'yes').lower() in [ 'true', 'yes', '1' ] 346 | 347 | auto_delete_idle_campaigns = event.get('AutoDeleteIdleCampaigns') 348 | if not auto_delete_idle_campaigns: 349 | auto_delete_idle_campaigns = os.environ.get('AutoDeleteIdleCampaigns', 'false').lower() in [ 'true', 'yes', '1' ] 350 | 351 | idle_campaign_threshold_hours = event.get('IdleCampaignThresholdHours') 352 | if not 
idle_campaign_threshold_hours: 353 | idle_campaign_threshold_hours = int(os.environ.get('IdleCampaignThresholdHours', '24')) 354 | 355 | if idle_campaign_threshold_hours < MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS: 356 | raise ValueError(f'"IdleCampaignThresholdHours" must be >= {MIN_IDLE_CAMPAIGN_THRESHOLD_HOURS} hours') 357 | 358 | auto_adjust_campaign_tps = event.get('AutoAdjustCampaignMinProvisionedTPS') 359 | if not auto_adjust_campaign_tps: 360 | auto_adjust_campaign_tps = os.environ.get('AutoAdjustCampaignMinProvisionedTPS', 'yes').lower() in [ 'true', 'yes', '1' ] 361 | 362 | campaigns = get_configured_active_campaigns(event) 363 | 364 | logger.info('Retrieving minProvisionedTPS for %d active campaigns', len(campaigns)) 365 | 366 | current_region = os.environ['AWS_REGION'] 367 | 368 | metric_datas_by_region = {} 369 | 370 | append_metric(metric_datas_by_region, current_region, { 371 | 'MetricName': 'monitoredCampaignCount', 372 | 'Value': len(campaigns), 373 | 'Unit': 'Count' 374 | }) 375 | 376 | campaign_metrics_written = 0 377 | all_metrics_written = 0 378 | alarms_created = 0 379 | 380 | # Define our 5 minute window, ensuring it's on prior 5 minute boundary. 
381 | end_time = datetime.datetime.now(datetime.timezone.utc) 382 | end_time = end_time.replace(microsecond=0,second=0, minute=end_time.minute - end_time.minute % 5) 383 | start_time = end_time - datetime.timedelta(minutes=5) 384 | 385 | for campaign in campaigns: 386 | campaign_arn = campaign['campaignArn'] 387 | campaign_region = extract_region(campaign_arn) 388 | 389 | min_provisioned_tps = campaign['minProvisionedTPS'] 390 | 391 | append_metric(metric_datas_by_region, campaign_region, { 392 | 'MetricName': 'minProvisionedTPS', 393 | 'Dimensions': [ 394 | { 395 | 'Name': 'CampaignArn', 396 | 'Value': campaign_arn 397 | } 398 | ], 399 | 'Value': min_provisioned_tps, 400 | 'Unit': 'Count/Second' 401 | }) 402 | 403 | tps = get_campaign_average_tps(campaign, start_time, end_time) 404 | utilization = 0 405 | 406 | if tps: 407 | append_metric(metric_datas_by_region, campaign_region, { 408 | 'MetricName': 'averageTPS', 409 | 'Dimensions': [ 410 | { 411 | 'Name': 'CampaignArn', 412 | 'Value': campaign_arn 413 | } 414 | ], 415 | 'Value': tps, 416 | 'Unit': 'Count/Second' 417 | }) 418 | 419 | utilization = tps / min_provisioned_tps * 100 420 | 421 | append_metric(metric_datas_by_region, campaign_region, { 422 | 'MetricName': 'campaignUtilization', 423 | 'Dimensions': [ 424 | { 425 | 'Name': 'CampaignArn', 426 | 'Value': campaign_arn 427 | } 428 | ], 429 | 'Value': utilization, 430 | 'Unit': 'Percent' 431 | }) 432 | 433 | logger.debug( 434 | 'Campaign %s has current minProvisionedTPS of %d and actual TPS of %s yielding %.2f%% utilization', 435 | campaign_arn, min_provisioned_tps, tps, utilization 436 | ) 437 | campaign_metrics_written += 1 438 | 439 | # Only do idle campaign and minProvisionedTPS adjustment checks once per hour for each campaign. 440 | perform_hourly_checks_this_run = perform_hourly_checks(campaign_arn) 441 | 442 | # Determine how old the campaign is and time since last update. 
443 | campaign_age_hours = get_campaign_age_hours(campaign) 444 | campaign_update_age_hours = get_campaign_last_update_age_hours(campaign) 445 | 446 | campaign_delete_event_fired = False 447 | 448 | if utilization == 0 and perform_hourly_checks_this_run and auto_delete_idle_campaigns: 449 | # Campaign is currently idle. Let's see if it's old enough and not being updated recently. 450 | logger.info( 451 | 'Performing idle delete check for campaign %s; campaign is %d hours old; last updated %s hours ago', 452 | campaign_arn, campaign_age_hours, campaign_update_age_hours 453 | ) 454 | 455 | if (campaign_age_hours >= idle_campaign_threshold_hours): 456 | 457 | # Campaign has been around long enough. Let's see how long it's been idle. 458 | end_time_idle_check = datetime.datetime.now(datetime.timezone.utc) 459 | start_time_idle_check = end_time_idle_check - datetime.timedelta(hours = idle_campaign_threshold_hours) 460 | period_idle_check = idle_campaign_threshold_hours * 60 * 60 461 | 462 | total_requests = get_campaign_total_requests(campaign, start_time_idle_check, end_time_idle_check, period_idle_check) 463 | 464 | if total_requests == 0: 465 | if is_campaign_updatable(campaign): 466 | reason = f'Campaign {campaign_arn} has been idle for at least {idle_campaign_threshold_hours} hours so initiating delete according to configuration.' 
467 | 468 | logger.info(reason) 469 | 470 | put_event( 471 | detail_type = 'DeletePersonalizeCampaign', 472 | detail = json.dumps({ 473 | 'CampaignARN': campaign_arn, 474 | 'CampaignUtilization': utilization, 475 | 'CampaignAgeHours': campaign_age_hours, 476 | 'IdleCampaignThresholdHours': idle_campaign_threshold_hours, 477 | 'TotalRequestsDuringIdleThresholdHours': total_requests, 478 | 'Reason': reason 479 | }), 480 | resources = [ campaign_arn ] 481 | ) 482 | 483 | campaign_delete_event_fired = True 484 | else: 485 | logger.warn( 486 | 'Campaign %s has been idle for at least %d hours but its status will not allow it to be deleted on this run', 487 | campaign_arn, idle_campaign_threshold_hours 488 | ) 489 | else: 490 | logger.warn( 491 | 'Campaign %s is currently idle but has had %d requests within the last %d hours so does not meet idle criteria for auto-deletion', 492 | campaign_arn, total_requests, idle_campaign_threshold_hours 493 | ) 494 | else: 495 | logger.info( 496 | 'Campaign %s is only %d hours old and last update %s hours old; too new to consider for auto-deletion', 497 | campaign_arn, campaign_age_hours, campaign_update_age_hours 498 | ) 499 | 500 | if (not campaign_delete_event_fired and 501 | perform_hourly_checks_this_run and 502 | auto_adjust_campaign_tps and 503 | min_provisioned_tps > 1): 504 | 505 | days_back = 14 506 | end_time_tps_check = datetime.datetime.now(datetime.timezone.utc).replace(minute=0, second=0, microsecond=0) 507 | start_time_tps_check = end_time_tps_check - datetime.timedelta(days = days_back) 508 | 509 | datapoints = get_campaign_sum_requests_by_hour(campaign, start_time_tps_check, end_time_tps_check) 510 | min_reqs = sys.maxsize 511 | max_reqs = total_reqs = total_avg_tps = min_avg_tps = max_avg_tps = 0 512 | 513 | for datapoint in datapoints: 514 | total_reqs += datapoint['Value'] 515 | min_reqs = min(min_reqs, datapoint['Value']) 516 | max_reqs = max(max_reqs, datapoint['Value']) 517 | 518 | if len(datapoints) > 0: 519 | 
total_avg_tps = int(total_reqs / (len(datapoints) * 3600)) 520 | min_avg_tps = int(min_reqs / 3600) 521 | max_avg_tps = int(max_reqs / 3600) 522 | 523 | logger.info( 524 | 'Performing minProvisionedTPS adjustment check for campaign %s; min/max/avg hourly TPS over last %d days for %d datapoints: %d/%d/%.2f', 525 | campaign_arn, days_back, len(datapoints), min_avg_tps, max_avg_tps, total_avg_tps 526 | ) 527 | 528 | min_age_to_update_hours = 24 529 | 530 | age_eligible = True 531 | 532 | if campaign_age_hours < min_age_to_update_hours: 533 | logger.info( 534 | 'Campaign %s is less than %d hours old so not eligible for minProvisionedTPS adjustment yet', 535 | campaign_arn, min_age_to_update_hours 536 | ) 537 | age_eligible = False 538 | 539 | if age_eligible and min_avg_tps < min_provisioned_tps: 540 | # Incrementally drop minProvisionedTPS. 541 | new_min_tps = max(1, int(math.floor(min_provisioned_tps * .75))) 542 | 543 | if is_campaign_updatable(campaign): 544 | reason = f'Step down adjustment of minProvisionedTPS for campaign {campaign_arn} down from {min_provisioned_tps} to {new_min_tps} based on average hourly TPS low watermark of {min_avg_tps} over last {days_back} days' 545 | logger.info(reason) 546 | 547 | put_event( 548 | detail_type = 'UpdatePersonalizeCampaignMinProvisionedTPS', 549 | detail = json.dumps({ 550 | 'CampaignARN': campaign_arn, 551 | 'CampaignUtilization': utilization, 552 | 'CampaignAgeHours': campaign_age_hours, 553 | 'CurrentProvisionedTPS': min_provisioned_tps, 554 | 'MinProvisionedTPS': new_min_tps, 555 | 'MinAverageTPS': min_avg_tps, 556 | 'MaxAverageTPS': max_avg_tps, 557 | 'Datapoints': datapoints, 558 | 'Reason': reason 559 | }, default = str), 560 | resources = [ campaign_arn ] 561 | ) 562 | else: 563 | logger.warn( 564 | 'Campaign %s could have its minProvisionedTPS adjusted down from %d to %d based on average hourly TPS low watermark over last %d days but its status will not allow it to be updated on this run', 565 | campaign_arn, 
min_provisioned_tps, new_min_tps, days_back 566 | ) 567 | 568 | if not campaign_delete_event_fired: 569 | if auto_create_utilization_alarms: 570 | if create_utilization_alarm(campaign_region, campaign, utilization_threshold_lower_bound): 571 | alarms_created += 1 572 | 573 | if auto_create_idle_alarms: 574 | if create_idle_campaign_alarm(campaign_region, campaign, idle_campaign_threshold_hours): 575 | alarms_created += 1 576 | 577 | for region, metric_datas in metric_datas_by_region.items(): 578 | cw = get_client(service_name = 'cloudwatch', region_name = region) 579 | 580 | metric_datas_chunks = divide_chunks(metric_datas, MAX_METRICS_PER_CALL) 581 | 582 | for metrics_datas_chunk in metric_datas_chunks: 583 | put_metrics(cw, metrics_datas_chunk) 584 | all_metrics_written += len(metrics_datas_chunk) 585 | 586 | outcome = f'Logged {all_metrics_written} TPS utilization metrics for {campaign_metrics_written} active campaigns; {alarms_created} alarms created' 587 | logger.info(outcome) 588 | 589 | if alarms_created > 0: 590 | # At least one new alarm was created so that likely means new campaigns were created too. Let's trigger the dashboard to be rebuilt. 591 | logger.info('Triggering rebuild of the CloudWatch dashboard since %d new alarm(s) were created', alarms_created) 592 | put_event( 593 | detail_type = 'BuildPersonalizeMonitorDashboard', 594 | detail = json.dumps({ 595 | 'Reason': f'Triggered rebuild due to {alarms_created} new alarm(s) being created' 596 | }) 597 | ) 598 | 599 | return outcome 600 | --------------------------------------------------------------------------------