├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── cloudformation ├── Makefile ├── requirements.txt ├── src │ ├── alarms.py │ ├── lambda.py │ └── metrics.py └── templates │ ├── alarms.json │ ├── lambda.json │ ├── lambda_standalone.json │ └── metrics.json ├── src └── chaos.py ├── test-requirements.txt └── test ├── base.py └── test_chaos.py /.gitignore: -------------------------------------------------------------------------------- 1 | .*.sw? 2 | *.py? 3 | chaos-lambda.zip 4 | venv 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015 British Broadcasting Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all test clean zip 2 | 3 | all: zip 4 | 5 | test: 6 | PYTHONPATH=src/ python3 -m unittest discover -v test/ 7 | 8 | clean: 9 | rm -f chaos-lambda.zip 10 | 11 | zip: chaos-lambda.zip 12 | 13 | chaos-lambda.zip: src/chaos.py 14 | zip -j $@ $^ 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | EC2 instances are volatile and can be recycled at any time without warning. 4 | Amazon recommends running them under Auto Scaling Groups to ensure overall 5 | service availability, but it's easy to forget that instances can suddenly fail 6 | until it happens in the early hours of the morning when everyone is on holiday. 7 | 8 | Chaos Lambda increases the rate at which these failures occur during business 9 | hours, helping teams to build services that handle them gracefully. 10 | 11 | 12 | # Quick setup 13 | 14 | Create the lambda function in the region you want it to target using the 15 | `cloudformation/templates/lambda_standalone.json` CloudFormation template. 16 | There are two parameters you may want to change: 17 | * `Schedule`: change if the default run times don't suit you (once per hour 18 | between 10am UTC and 4pm UTC, Monday to Friday); see 19 | http://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html 20 | for documentation on the syntax. 21 | * `DefaultProbability`: by default all Auto Scaling Groups in the region are 22 | targets; set this to `0.0` and only ASGs with a `chaos-lambda-termination` 23 | tag (see below) will be affected. 
24 | 25 | 26 | # Notifications 27 | 28 | ## Termination Topic 29 | 30 | By deploying the `lambda_standalone.json` CloudFormation template, an SNS topic 31 | will be created with the name `ChaosLambdaTerminationTopic`. For each instance 32 | that gets terminated, a notification will be published using this structure: 33 | 34 | ```json 35 | { 36 | "event_name": "chaos_lambda.terminating", 37 | "asg_name": "my-autoscaling-group", 38 | "instance_id": "i-00001234" 39 | } 40 | ``` 41 | 42 | By default, no subscriptions are created to this topic, so it is up to you to 43 | subscribe a queue or another lambda if you wish. 44 | 45 | ## Failure topic 46 | 47 | To receive notifications if the lambda function fails for any reason, create 48 | another stack using the `cloudformation/templates/alarms.json` template. This 49 | takes the lambda function name (something similar to 50 | `chaos-lambda-ChaosLambdaFunction-EM2XNWWNZTPW`) and the email address to 51 | send the alerts to. 52 | 53 | 54 | # Probability of termination 55 | 56 | Every time the lambda triggers it examines all the Auto Scaling Groups in the 57 | region and potentially terminates one instance in each. The probability of 58 | termination can be changed at the ASG level with a tag, and at a global level 59 | with the `DefaultProbability` stack parameter. 60 | 61 | At the ASG level the probability can be controlled by adding a 62 | `chaos-lambda-termination` tag with a value between `0.0` (never terminate) and 63 | `1.0` (always terminate). Typically this would be used to opt out a legacy 64 | system (`0.0`). 65 | 66 | The `DefaultProbability` parameter sets the probability of termination for any 67 | ASG _without_ a valid `chaos-lambda-termination` tag. If set to `0.0` the 68 | system becomes "opt-in", where any ASG without this tag is ignored. The 69 | default is `0.166` (or 1 in 6). 
70 | 71 | 72 | # Enabling/disabling 73 | 74 | The lambda is triggered by a CloudWatch Events rule, the name of which can be 75 | found from the `ChaosLambdaRuleOutput` output of the lambda stack. Locate 76 | this rule in the AWS console under the Rules section of the CloudWatch service, 77 | and you can disable or enable it via the `Actions` button. 78 | 79 | 80 | # Regions 81 | 82 | By default the lambda will target ASGs running in the same region. It's 83 | generally a good idea to avoid cross-region actions, but if necessary an 84 | alternative list of one or more region names can be specified in the `Regions` 85 | stack parameter. 86 | 87 | The value is a comma-separated list of region names with optional whitespace, 88 | so the following are all valid and equivalent: 89 | * `ap-south-1,eu-west-1,us-east-1` 90 | * `ap-south-1, eu-west-1, us-east-1` 91 | * `ap-south-1 , eu-west-1 , us-east-1` 92 | 93 | 94 | # Log messages 95 | 96 | Chaos Lambda log lines always start with a timestamp and a word specifying the 97 | event type. The timestamp is of the form `YYYY-MM-DDThh:mm:ssZ`, e.g. 98 | `2015-12-11T14:00:37Z`, and the timezone will always be `Z`. The different 99 | event types are described below. 100 | 101 | ## bad-probability 102 | 103 | `<timestamp> bad-probability [<value>] in <asg name>` 104 | 105 | Example: 106 | 107 | `2015-12-11T14:07:21Z bad-probability [not often] in test-app-ASG-7LJI5SY4VX6T` 108 | 109 | If the value of the `chaos-lambda-termination` tag isn't a number between `0.0` 110 | and `1.0` inclusive then it will be logged in one of these lines. The square 111 | brackets around the value allow CloudWatch Logs to find the full value even if 112 | it contains spaces. 113 | 114 | ## result 115 | 116 | `<timestamp> result <instance id> is <state>` 117 | 118 | Example: 119 | 120 | `2015-12-11T14:00:40Z result i-fe705d77 is shutting-down` 121 | 122 | After asking EC2 to terminate each of the targeted instances the new state of 123 | each is logged with a `result` line.
The `<state>` value is taken from the 124 | `name` property of the `InstanceState` AWS type described at 125 | http://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_InstanceState.html 126 | 127 | ## targeting 128 | 129 | `<timestamp> targeting <instance id> in <asg name>` 130 | 131 | Example: 132 | 133 | `2015-12-11T14:00:38Z targeting i-168f9eaf in test-app-ASG-1LOMEKEVBXXXS` 134 | 135 | The `targeting` lines list all of the instances that are about to be 136 | terminated, before the `TerminateInstances` call occurs. 137 | 138 | ## triggered 139 | 140 | `<timestamp> triggered <region>` 141 | 142 | Example: 143 | 144 | `2015-12-11T14:00:37Z triggered eu-west-1` 145 | 146 | Generated when the lambda is triggered, indicating the region that will be 147 | affected. 148 | -------------------------------------------------------------------------------- /cloudformation/Makefile: -------------------------------------------------------------------------------- 1 | SOURCES := $(shell echo src/*.py) 2 | TARGETS := $(patsubst src/%.py,templates/%.json,$(SOURCES)) templates/lambda_standalone.json 3 | 4 | CHAOS_PY := ../src/chaos.py 5 | 6 | all: $(TARGETS) 7 | 8 | clean: 9 | rm -f $(TARGETS) 10 | 11 | templates/lambda_standalone.json: src/lambda.py $(CHAOS_PY) venv 12 | venv/bin/python $< $@ $(CHAOS_PY) 13 | 14 | templates/%.json: src/%.py venv 15 | venv/bin/python $< $@ 16 | 17 | venv: requirements.txt 18 | rm -rf $@ 19 | (python3 -m venv $@ && $@/bin/pip install -r $<) || rm -rf $@ 20 | -------------------------------------------------------------------------------- /cloudformation/requirements.txt: -------------------------------------------------------------------------------- 1 | troposphere == 4.1.0 2 | -------------------------------------------------------------------------------- /cloudformation/src/alarms.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from troposphere import Parameter, Ref, Template 4 | from troposphere.cloudwatch import Alarm, MetricDimension 5 | 
from troposphere.sns import Subscription, Topic 6 | 7 | 8 | METRIC_NAMESPACE = "BBC/CHAOS-LAMBDA" 9 | 10 | t = Template() 11 | 12 | t.set_description("Chaos Lambda alarms") 13 | 14 | alarm_email = t.add_parameter( 15 | Parameter( 16 | "ChaosLambdaAlarmEmail", 17 | Description="Email address to notify if there are any " 18 | "operational issues", 19 | Type="String", 20 | ) 21 | ) 22 | 23 | lambda_function_name = t.add_parameter( 24 | Parameter( 25 | "LambdaFunctionName", 26 | Description="The name of the lambda function", 27 | Type="String", 28 | ) 29 | ) 30 | 31 | alarm_topic = t.add_resource( 32 | Topic( 33 | "ChaosLambdaAlarmTopic", 34 | Subscription=[ 35 | Subscription( 36 | Endpoint=Ref(alarm_email), 37 | Protocol="email" 38 | ), 39 | ], 40 | ) 41 | ) 42 | 43 | t.add_resource( 44 | Alarm( 45 | "ChaosLambdaErrorAlarm", 46 | AlarmName="chaosLambda/LambdaError", 47 | AlarmDescription="Enters ALARM state because we have received a " 48 | "lamdba error. See 'Errors' section on the following " 49 | "link: http://docs.aws.amazon.com/lambda/latest/dg/" 50 | "monitoring-functions-metrics.html for more " 51 | "information.", 52 | Namespace="AWS/Lambda", 53 | MetricName="Errors", 54 | Dimensions=[ 55 | MetricDimension( 56 | Name="FunctionName", 57 | Value=Ref(lambda_function_name) 58 | ), 59 | ], 60 | Statistic="Sum", 61 | Period="60", 62 | EvaluationPeriods="1", 63 | Threshold="1", 64 | Unit="Count", 65 | ComparisonOperator="GreaterThanOrEqualToThreshold", 66 | AlarmActions=[Ref(alarm_topic), ], 67 | ) 68 | ) 69 | 70 | t.add_resource( 71 | Alarm( 72 | "ChaosLambdaDurationAlarm", 73 | AlarmName="chaosLambda/LambdaDuration", 74 | AlarmDescription="Enters ALARM state because we have functions taking " 75 | "longer than expected. Please adjust the available " 76 | "lambda process time accordingly, then replay any " 77 | " failed events. 
See 'Duration' section on the " 78 | " following link: " 79 | "http://docs.aws.amazon.com/lambda/latest/dg/" 80 | "monitoring-functions-metrics.html for more " 81 | "information.", 82 | Namespace="AWS/Lambda", 83 | MetricName="Duration", 84 | Dimensions=[ 85 | MetricDimension( 86 | Name="FunctionName", 87 | Value=Ref(lambda_function_name) 88 | ), 89 | ], 90 | Statistic="Maximum", 91 | Period="60", 92 | EvaluationPeriods="1", 93 | Threshold="7000", 94 | Unit="Milliseconds", 95 | ComparisonOperator="GreaterThanThreshold", 96 | AlarmActions=[Ref(alarm_topic), ], 97 | ) 98 | ) 99 | 100 | ''' 101 | t.add_resource( 102 | Alarm( 103 | "Liveliness", 104 | AlarmName="chaosLambda/Liveliness", 105 | AlarmDescription="Enters ALARM state if the Chaos Lambda hasn't " 106 | "triggered within a seven day window.", 107 | Namespace=METRIC_NAMESPACE, 108 | MetricName="triggered", 109 | EvaluationPeriods="1", 110 | Period="604800", 111 | Statistic="SampleCount", 112 | ComparisonOperator="LessThanThreshold", 113 | Threshold="1", 114 | Unit="None", 115 | AlarmActions=[Ref(alarm_topic)], 116 | ) 117 | ) 118 | ''' 119 | 120 | template = t.to_json(indent=4) 121 | if len(sys.argv) > 1: 122 | open(sys.argv[1], "w").write(template + "\n") 123 | else: 124 | print(template) 125 | -------------------------------------------------------------------------------- /cloudformation/src/lambda.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | from troposphere import GetAtt, Output, Parameter, Ref, Sub, Template 5 | from troposphere.awslambda import Code, Environment, Function, Permission 6 | from troposphere.logs import LogGroup 7 | from troposphere.iam import Role, Policy 8 | from troposphere.events import Rule, Target 9 | from troposphere.sns import Topic 10 | 11 | 12 | if len(sys.argv) > 2: 13 | source = open(sys.argv[2], "r").read() 14 | # Reclaim a few bytes (maximum size is 4096!) 
by converting four space 15 | # indents to single space indents 16 | indent_re = re.compile(r"^((?: ){1,})", re.MULTILINE) 17 | source = indent_re.sub(lambda m: " " * (len(m.group(1)) // 4), source) 18 | else: 19 | source = None 20 | 21 | 22 | t = Template() 23 | t.set_description("Chaos Lambda") 24 | 25 | if source is None: 26 | s3_bucket = t.add_parameter(Parameter( 27 | "S3Bucket", 28 | Description="Name of the S3 bucket containing the Lambda zip file", 29 | Type="String", 30 | )) 31 | s3_key = t.add_parameter(Parameter( 32 | "S3Key", 33 | Description="Path to the Lambda zip file under the bucket", 34 | Default="chaos-lambda.zip", 35 | Type="String", 36 | )) 37 | lambda_code = Code(S3Bucket=Ref(s3_bucket), S3Key=Ref(s3_key)) 38 | module_name = "chaos" 39 | else: 40 | lambda_code = Code(ZipFile=source) 41 | module_name = "index" 42 | 43 | chaos_schedule = t.add_parameter(Parameter( 44 | "Schedule", 45 | Description="Schedule on which to run (UTC time zone)", 46 | Default="cron(0 10-16 ? 
* MON-FRI *)", 47 | Type="String" 48 | )) 49 | 50 | default_probability = t.add_parameter(Parameter( 51 | "DefaultProbability", 52 | Description="Default termination probability", 53 | Default=1.0 / 6.0, 54 | MinValue=0.0, 55 | MaxValue=1.0, 56 | Type="Number" 57 | )) 58 | 59 | regions = t.add_parameter(Parameter( 60 | "Regions", 61 | Description="Override default region with comma-separated list of regions", 62 | Type="String" 63 | )) 64 | 65 | log_retention_period = t.add_parameter(Parameter( 66 | "LogRetentionPeriod", 67 | Description="Log retention period", 68 | Default=90, 69 | Type="Number" 70 | )) 71 | 72 | termination_topic = t.add_resource( 73 | Topic("ChaosLambdaTerminationTopic") 74 | ) 75 | 76 | lambda_policy = Policy( 77 | PolicyName="ChaosLambdaPolicy", 78 | PolicyDocument={ 79 | "Version": "2012-10-17", 80 | "Statement": [ 81 | { 82 | "Effect": "Allow", 83 | "Action": [ 84 | "logs:CreateLogGroup", 85 | "logs:CreateLogStream", 86 | "logs:PutLogEvents" 87 | ], 88 | "Resource": "arn:aws:logs:*:*:*" 89 | }, 90 | { 91 | "Effect": "Allow", 92 | "Action": [ 93 | "ses:SendEmail", 94 | "ec2:TerminateInstances", 95 | "autoscaling:DescribeAutoScalingGroups" 96 | ], 97 | "Resource": "*" 98 | }, 99 | { 100 | "Effect": "Allow", 101 | "Action": [ 102 | "sns:Publish" 103 | ], 104 | "Resource": Ref(termination_topic) 105 | } 106 | ] 107 | } 108 | ) 109 | 110 | lambda_role = Role( 111 | "ChaosLambdaRole", 112 | AssumeRolePolicyDocument={ 113 | "Version": "2012-10-17", 114 | "Statement": [{ 115 | "Effect": "Allow", 116 | "Principal": { 117 | "Service": ["lambda.amazonaws.com"] 118 | }, 119 | "Action": ["sts:AssumeRole"] 120 | }] 121 | }, 122 | Path="/lambda/", 123 | Policies=[lambda_policy] 124 | ) 125 | t.add_resource(lambda_role) 126 | 127 | lambda_log_group = t.add_resource(LogGroup( 128 | "ChaosLambdaLogGroup", 129 | LogGroupName=Sub("/aws/lambda/${AWS::StackName}-function"), 130 | RetentionInDays=Ref(log_retention_period), 131 | )) 132 | 133 | lambda_function = 
t.add_resource(Function( 134 | "ChaosLambdaFunction", 135 | Description="CloudFormation Lambda", 136 | FunctionName=Sub("${AWS::StackName}-function"), 137 | Code=lambda_code, 138 | Environment=Environment(Variables={ 139 | "probability": Ref(default_probability), 140 | "regions": Ref(regions), 141 | "termination_topic_arn": Ref(termination_topic), 142 | }), 143 | Handler=module_name + ".handler", 144 | MemorySize=128, 145 | Role=GetAtt(lambda_role, "Arn"), 146 | Runtime="python3.11", 147 | Timeout=30, 148 | DependsOn=lambda_log_group.title 149 | )) 150 | 151 | chaos_lambda_rule = t.add_resource(Rule( 152 | "ChaosLambdaRule", 153 | Description="Trigger Chaos Lambda according to a schedule", 154 | State="ENABLED", 155 | ScheduleExpression=Ref(chaos_schedule), 156 | Targets=[ 157 | Target(Arn=GetAtt(lambda_function, "Arn"), Id="ChaosLambdaRuleTarget") 158 | ] 159 | )) 160 | t.add_resource(Permission( 161 | "ChaosLambdaRulePermission", 162 | FunctionName=GetAtt(lambda_function, "Arn"), 163 | SourceArn=GetAtt(chaos_lambda_rule, "Arn"), 164 | Principal="events.amazonaws.com", 165 | Action="lambda:InvokeFunction" 166 | )) 167 | 168 | t.add_output(Output( 169 | "ChaosLambdaFunctionOutput", 170 | Value=Ref(lambda_function), 171 | Description="The Chaos Lambda Function" 172 | )) 173 | t.add_output(Output( 174 | "ChaosLambdaRuleOutput", 175 | Value=Ref(chaos_lambda_rule), 176 | Description="Rule used to trigger the Chaos Lambda" 177 | )) 178 | 179 | template = t.to_json(indent=4) 180 | if len(sys.argv) > 1: 181 | open(sys.argv[1], "w").write(template + "\n") 182 | else: 183 | print(template) 184 | -------------------------------------------------------------------------------- /cloudformation/src/metrics.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from troposphere import Template, Ref, Parameter 4 | from troposphere.logs import MetricFilter, MetricTransformation 5 | 6 | METRIC_NAMESPACE = "BBC/CHAOS-LAMBDA" 7 | 8 
| t = Template() 9 | 10 | log_group = t.add_parameter( 11 | Parameter( 12 | "LambdaLogGroupName", 13 | Description="The name of the log group for the lambda function.", 14 | Type="String", 15 | ) 16 | ) 17 | 18 | t.set_description( 19 | "Metrics and filters for Chaos Lambda" 20 | ) 21 | 22 | lambda_metrics = { 23 | "liveliness": { 24 | "FilterPattern": ( 25 | "[datetime, event=\"triggered\", ...]" 26 | ), 27 | "MetricTransformations": [ 28 | MetricTransformation( 29 | MetricNamespace=METRIC_NAMESPACE, 30 | MetricName="triggered", 31 | MetricValue="1", 32 | ) 33 | ] 34 | } 35 | } 36 | 37 | for name, metric in lambda_metrics.items(): 38 | metric["LogGroupName"] = Ref(log_group) 39 | t.add_resource(MetricFilter(name, **metric)) 40 | 41 | template = t.to_json(indent=4) 42 | if len(sys.argv) > 1: 43 | open(sys.argv[1], "w").write(template + "\n") 44 | else: 45 | print(template) 46 | -------------------------------------------------------------------------------- /cloudformation/templates/alarms.json: -------------------------------------------------------------------------------- 1 | { 2 | "Description": "Chaos Lambda alarms", 3 | "Parameters": { 4 | "ChaosLambdaAlarmEmail": { 5 | "Description": "Email address to notify if there are any operational issues", 6 | "Type": "String" 7 | }, 8 | "LambdaFunctionName": { 9 | "Description": "The name of the lambda function", 10 | "Type": "String" 11 | } 12 | }, 13 | "Resources": { 14 | "ChaosLambdaAlarmTopic": { 15 | "Properties": { 16 | "Subscription": [ 17 | { 18 | "Endpoint": { 19 | "Ref": "ChaosLambdaAlarmEmail" 20 | }, 21 | "Protocol": "email" 22 | } 23 | ] 24 | }, 25 | "Type": "AWS::SNS::Topic" 26 | }, 27 | "ChaosLambdaDurationAlarm": { 28 | "Properties": { 29 | "AlarmActions": [ 30 | { 31 | "Ref": "ChaosLambdaAlarmTopic" 32 | } 33 | ], 34 | "AlarmDescription": "Enters ALARM state because we have functions taking longer than expected. 
Please adjust the available lambda process time accordingly, then replay any failed events. See 'Duration' section on the following link: http://docs.aws.amazon.com/lambda/latest/dg/monitoring-functions-metrics.html for more information.", 35 | "AlarmName": "chaosLambda/LambdaDuration", 36 | "ComparisonOperator": "GreaterThanThreshold", 37 | "Dimensions": [ 38 | { 39 | "Name": "FunctionName", 40 | "Value": { 41 | "Ref": "LambdaFunctionName" 42 | } 43 | } 44 | ], 45 | "EvaluationPeriods": "1", 46 | "MetricName": "Duration", 47 | "Namespace": "AWS/Lambda", 48 | "Period": "60", 49 | "Statistic": "Maximum", 50 | "Threshold": "7000", 51 | "Unit": "Milliseconds" 52 | }, 53 | "Type": "AWS::CloudWatch::Alarm" 54 | }, 55 | "ChaosLambdaErrorAlarm": { 56 | "Properties": { 57 | "AlarmActions": [ 58 | { 59 | "Ref": "ChaosLambdaAlarmTopic" 60 | } 61 | ], 62 | "AlarmDescription": "Enters ALARM state because we have received a lamdba error. See 'Errors' section on the following link: http://docs.aws.amazon.com/lambda/latest/dg/monitoring-functions-metrics.html for more information.", 63 | "AlarmName": "chaosLambda/LambdaError", 64 | "ComparisonOperator": "GreaterThanOrEqualToThreshold", 65 | "Dimensions": [ 66 | { 67 | "Name": "FunctionName", 68 | "Value": { 69 | "Ref": "LambdaFunctionName" 70 | } 71 | } 72 | ], 73 | "EvaluationPeriods": "1", 74 | "MetricName": "Errors", 75 | "Namespace": "AWS/Lambda", 76 | "Period": "60", 77 | "Statistic": "Sum", 78 | "Threshold": "1", 79 | "Unit": "Count" 80 | }, 81 | "Type": "AWS::CloudWatch::Alarm" 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /cloudformation/templates/lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Description": "Chaos Lambda", 3 | "Outputs": { 4 | "ChaosLambdaFunctionOutput": { 5 | "Description": "The Chaos Lambda Function", 6 | "Value": { 7 | "Ref": "ChaosLambdaFunction" 8 | } 9 | }, 10 | 
"ChaosLambdaRuleOutput": { 11 | "Description": "Rule used to trigger the Chaos Lambda", 12 | "Value": { 13 | "Ref": "ChaosLambdaRule" 14 | } 15 | } 16 | }, 17 | "Parameters": { 18 | "DefaultProbability": { 19 | "Default": 0.16666666666666666, 20 | "Description": "Default termination probability", 21 | "MaxValue": 1.0, 22 | "MinValue": 0.0, 23 | "Type": "Number" 24 | }, 25 | "LogRetentionPeriod": { 26 | "Default": 90, 27 | "Description": "Log retention period", 28 | "Type": "Number" 29 | }, 30 | "Regions": { 31 | "Description": "Override default region with comma-separated list of regions", 32 | "Type": "String" 33 | }, 34 | "S3Bucket": { 35 | "Description": "Name of the S3 bucket containing the Lambda zip file", 36 | "Type": "String" 37 | }, 38 | "S3Key": { 39 | "Default": "chaos-lambda.zip", 40 | "Description": "Path to the Lambda zip file under the bucket", 41 | "Type": "String" 42 | }, 43 | "Schedule": { 44 | "Default": "cron(0 10-16 ? * MON-FRI *)", 45 | "Description": "Schedule on which to run (UTC time zone)", 46 | "Type": "String" 47 | } 48 | }, 49 | "Resources": { 50 | "ChaosLambdaFunction": { 51 | "DependsOn": "ChaosLambdaLogGroup", 52 | "Properties": { 53 | "Code": { 54 | "S3Bucket": { 55 | "Ref": "S3Bucket" 56 | }, 57 | "S3Key": { 58 | "Ref": "S3Key" 59 | } 60 | }, 61 | "Description": "CloudFormation Lambda", 62 | "Environment": { 63 | "Variables": { 64 | "probability": { 65 | "Ref": "DefaultProbability" 66 | }, 67 | "regions": { 68 | "Ref": "Regions" 69 | }, 70 | "termination_topic_arn": { 71 | "Ref": "ChaosLambdaTerminationTopic" 72 | } 73 | } 74 | }, 75 | "FunctionName": { 76 | "Fn::Sub": "${AWS::StackName}-function" 77 | }, 78 | "Handler": "chaos.handler", 79 | "MemorySize": 128, 80 | "Role": { 81 | "Fn::GetAtt": [ 82 | "ChaosLambdaRole", 83 | "Arn" 84 | ] 85 | }, 86 | "Runtime": "python3.11", 87 | "Timeout": 30 88 | }, 89 | "Type": "AWS::Lambda::Function" 90 | }, 91 | "ChaosLambdaLogGroup": { 92 | "Properties": { 93 | "LogGroupName": { 94 | 
"Fn::Sub": "/aws/lambda/${AWS::StackName}-function" 95 | }, 96 | "RetentionInDays": { 97 | "Ref": "LogRetentionPeriod" 98 | } 99 | }, 100 | "Type": "AWS::Logs::LogGroup" 101 | }, 102 | "ChaosLambdaRole": { 103 | "Properties": { 104 | "AssumeRolePolicyDocument": { 105 | "Statement": [ 106 | { 107 | "Action": [ 108 | "sts:AssumeRole" 109 | ], 110 | "Effect": "Allow", 111 | "Principal": { 112 | "Service": [ 113 | "lambda.amazonaws.com" 114 | ] 115 | } 116 | } 117 | ], 118 | "Version": "2012-10-17" 119 | }, 120 | "Path": "/lambda/", 121 | "Policies": [ 122 | { 123 | "PolicyDocument": { 124 | "Statement": [ 125 | { 126 | "Action": [ 127 | "logs:CreateLogGroup", 128 | "logs:CreateLogStream", 129 | "logs:PutLogEvents" 130 | ], 131 | "Effect": "Allow", 132 | "Resource": "arn:aws:logs:*:*:*" 133 | }, 134 | { 135 | "Action": [ 136 | "ses:SendEmail", 137 | "ec2:TerminateInstances", 138 | "autoscaling:DescribeAutoScalingGroups" 139 | ], 140 | "Effect": "Allow", 141 | "Resource": "*" 142 | }, 143 | { 144 | "Action": [ 145 | "sns:Publish" 146 | ], 147 | "Effect": "Allow", 148 | "Resource": { 149 | "Ref": "ChaosLambdaTerminationTopic" 150 | } 151 | } 152 | ], 153 | "Version": "2012-10-17" 154 | }, 155 | "PolicyName": "ChaosLambdaPolicy" 156 | } 157 | ] 158 | }, 159 | "Type": "AWS::IAM::Role" 160 | }, 161 | "ChaosLambdaRule": { 162 | "Properties": { 163 | "Description": "Trigger Chaos Lambda according to a schedule", 164 | "ScheduleExpression": { 165 | "Ref": "Schedule" 166 | }, 167 | "State": "ENABLED", 168 | "Targets": [ 169 | { 170 | "Arn": { 171 | "Fn::GetAtt": [ 172 | "ChaosLambdaFunction", 173 | "Arn" 174 | ] 175 | }, 176 | "Id": "ChaosLambdaRuleTarget" 177 | } 178 | ] 179 | }, 180 | "Type": "AWS::Events::Rule" 181 | }, 182 | "ChaosLambdaRulePermission": { 183 | "Properties": { 184 | "Action": "lambda:InvokeFunction", 185 | "FunctionName": { 186 | "Fn::GetAtt": [ 187 | "ChaosLambdaFunction", 188 | "Arn" 189 | ] 190 | }, 191 | "Principal": "events.amazonaws.com", 192 | 
"SourceArn": { 193 | "Fn::GetAtt": [ 194 | "ChaosLambdaRule", 195 | "Arn" 196 | ] 197 | } 198 | }, 199 | "Type": "AWS::Lambda::Permission" 200 | }, 201 | "ChaosLambdaTerminationTopic": { 202 | "Type": "AWS::SNS::Topic" 203 | } 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /cloudformation/templates/lambda_standalone.json: -------------------------------------------------------------------------------- 1 | { 2 | "Description": "Chaos Lambda", 3 | "Outputs": { 4 | "ChaosLambdaFunctionOutput": { 5 | "Description": "The Chaos Lambda Function", 6 | "Value": { 7 | "Ref": "ChaosLambdaFunction" 8 | } 9 | }, 10 | "ChaosLambdaRuleOutput": { 11 | "Description": "Rule used to trigger the Chaos Lambda", 12 | "Value": { 13 | "Ref": "ChaosLambdaRule" 14 | } 15 | } 16 | }, 17 | "Parameters": { 18 | "DefaultProbability": { 19 | "Default": 0.16666666666666666, 20 | "Description": "Default termination probability", 21 | "MaxValue": 1.0, 22 | "MinValue": 0.0, 23 | "Type": "Number" 24 | }, 25 | "LogRetentionPeriod": { 26 | "Default": 90, 27 | "Description": "Log retention period", 28 | "Type": "Number" 29 | }, 30 | "Regions": { 31 | "Description": "Override default region with comma-separated list of regions", 32 | "Type": "String" 33 | }, 34 | "Schedule": { 35 | "Default": "cron(0 10-16 ? 
* MON-FRI *)", 36 | "Description": "Schedule on which to run (UTC time zone)", 37 | "Type": "String" 38 | } 39 | }, 40 | "Resources": { 41 | "ChaosLambdaFunction": { 42 | "DependsOn": "ChaosLambdaLogGroup", 43 | "Properties": { 44 | "Code": { 45 | "ZipFile": "import json\nimport os\nimport random\nimport time\n\nimport boto3\n\n\nPROBABILITY_TAG = \"chaos-lambda-termination\"\nDEFAULT_PROBABILITY = 1.0 / 6.0\n\n\ndef log(*args):\n timestamp = time.strftime(\"%Y-%m-%dT%H:%M:%SZ\", time.gmtime())\n print(timestamp, *args)\n\n\ndef get_asg_tag(asg, name, default=None):\n name = name.lower()\n for tag in asg.get(\"Tags\", []):\n if tag.get(\"Key\", \"\").lower() == name:\n return tag.get(\"Value\", \"\")\n return default\n\n\ndef safe_float(s, default):\n try:\n return float(s)\n except ValueError:\n return default\n\n\ndef get_asg_probability(asg, default):\n value = get_asg_tag(asg, PROBABILITY_TAG, None)\n if value is None:\n return default\n\n probability = safe_float(value, None)\n if probability is not None and 0.0 <= probability <= 1.0:\n return probability\n\n asg_name = asg[\"AutoScalingGroupName\"]\n log(\"bad-probability\", \"[\" + value + \"]\", \"in\", asg_name)\n return default\n\n\ndef get_asg_instance_id(asg, default):\n instances = asg.get(\"Instances\", [])\n if len(instances) == 0:\n return None\n\n probability = get_asg_probability(asg, default)\n if random.random() >= probability:\n return None\n else:\n return random.choice(instances).get(\"InstanceId\", None)\n\n\ndef get_all_asgs(autoscaling):\n paginator = autoscaling.get_paginator(\"describe_auto_scaling_groups\")\n for response in paginator.paginate():\n for asg in response.get(\"AutoScalingGroups\", []):\n yield asg\n\n\ndef get_targets(autoscaling, default_probability):\n targets = []\n for asg in get_all_asgs(autoscaling):\n instance_id = get_asg_instance_id(asg, default_probability)\n if instance_id is not None:\n targets.append((asg[\"AutoScalingGroupName\"], instance_id))\n return 
targets\n\n\ndef send_notification(sns, instance_id, asg_name):\n topic = os.environ.get(\"termination_topic_arn\", \"\").strip()\n if topic == '':\n return\n notification = {\n \"event_name\": \"chaos_lambda.terminating\",\n \"instance_id\": instance_id,\n \"asg_name\": asg_name,\n }\n sns.publish(\n TopicArn=topic,\n Message=json.dumps(notification)\n )\n\n\ndef terminate_targets(ec2, sns, targets):\n for asg_name, instance_id in targets:\n log(\"targeting\", instance_id, \"in\", asg_name)\n try:\n send_notification(sns, instance_id, asg_name)\n except Exception as e:\n log(\"Failed to send notification\", e)\n\n instance_ids = [instance_id for (asg_name, instance_id) in targets]\n response = ec2.terminate_instances(InstanceIds=instance_ids)\n\n results = []\n for i in response.get(\"TerminatingInstances\", []):\n results.append((i[\"InstanceId\"], i[\"CurrentState\"][\"Name\"]))\n\n for instance_id, state in results:\n log(\"result\", instance_id, \"is\", state)\n\n return results\n\n\ndef chaos_lambda(regions, default_probability):\n for region in regions:\n log(\"triggered\", region)\n autoscaling = boto3.client(\"autoscaling\", region_name=region)\n targets = get_targets(autoscaling, default_probability)\n if len(targets) != 0:\n ec2 = boto3.client(\"ec2\", region_name=region)\n sns = boto3.client(\"sns\", region_name=region)\n terminate_targets(ec2, sns, targets)\n\n\ndef get_regions(context):\n v = os.environ.get(\"regions\", \"\").strip()\n if len(v) == 0:\n return [context.invoked_function_arn.split(\":\")[3]]\n else:\n return list(filter(None, [s.strip() for s in v.split(\",\")]))\n\n\ndef get_default_probability():\n v = os.environ.get(\"probability\", \"\").strip()\n if len(v) == 0:\n return DEFAULT_PROBABILITY\n else:\n return float(v)\n\n\ndef handler(event, context):\n regions = get_regions(context)\n probability = get_default_probability()\n chaos_lambda(regions, probability)\n" 46 | }, 47 | "Description": "CloudFormation Lambda", 48 | 
"Environment": { 49 | "Variables": { 50 | "probability": { 51 | "Ref": "DefaultProbability" 52 | }, 53 | "regions": { 54 | "Ref": "Regions" 55 | }, 56 | "termination_topic_arn": { 57 | "Ref": "ChaosLambdaTerminationTopic" 58 | } 59 | } 60 | }, 61 | "FunctionName": { 62 | "Fn::Sub": "${AWS::StackName}-function" 63 | }, 64 | "Handler": "index.handler", 65 | "MemorySize": 128, 66 | "Role": { 67 | "Fn::GetAtt": [ 68 | "ChaosLambdaRole", 69 | "Arn" 70 | ] 71 | }, 72 | "Runtime": "python3.11", 73 | "Timeout": 30 74 | }, 75 | "Type": "AWS::Lambda::Function" 76 | }, 77 | "ChaosLambdaLogGroup": { 78 | "Properties": { 79 | "LogGroupName": { 80 | "Fn::Sub": "/aws/lambda/${AWS::StackName}-function" 81 | }, 82 | "RetentionInDays": { 83 | "Ref": "LogRetentionPeriod" 84 | } 85 | }, 86 | "Type": "AWS::Logs::LogGroup" 87 | }, 88 | "ChaosLambdaRole": { 89 | "Properties": { 90 | "AssumeRolePolicyDocument": { 91 | "Statement": [ 92 | { 93 | "Action": [ 94 | "sts:AssumeRole" 95 | ], 96 | "Effect": "Allow", 97 | "Principal": { 98 | "Service": [ 99 | "lambda.amazonaws.com" 100 | ] 101 | } 102 | } 103 | ], 104 | "Version": "2012-10-17" 105 | }, 106 | "Path": "/lambda/", 107 | "Policies": [ 108 | { 109 | "PolicyDocument": { 110 | "Statement": [ 111 | { 112 | "Action": [ 113 | "logs:CreateLogGroup", 114 | "logs:CreateLogStream", 115 | "logs:PutLogEvents" 116 | ], 117 | "Effect": "Allow", 118 | "Resource": "arn:aws:logs:*:*:*" 119 | }, 120 | { 121 | "Action": [ 122 | "ses:SendEmail", 123 | "ec2:TerminateInstances", 124 | "autoscaling:DescribeAutoScalingGroups" 125 | ], 126 | "Effect": "Allow", 127 | "Resource": "*" 128 | }, 129 | { 130 | "Action": [ 131 | "sns:Publish" 132 | ], 133 | "Effect": "Allow", 134 | "Resource": { 135 | "Ref": "ChaosLambdaTerminationTopic" 136 | } 137 | } 138 | ], 139 | "Version": "2012-10-17" 140 | }, 141 | "PolicyName": "ChaosLambdaPolicy" 142 | } 143 | ] 144 | }, 145 | "Type": "AWS::IAM::Role" 146 | }, 147 | "ChaosLambdaRule": { 148 | "Properties": { 149 | 
"Description": "Trigger Chaos Lambda according to a schedule", 150 | "ScheduleExpression": { 151 | "Ref": "Schedule" 152 | }, 153 | "State": "ENABLED", 154 | "Targets": [ 155 | { 156 | "Arn": { 157 | "Fn::GetAtt": [ 158 | "ChaosLambdaFunction", 159 | "Arn" 160 | ] 161 | }, 162 | "Id": "ChaosLambdaRuleTarget" 163 | } 164 | ] 165 | }, 166 | "Type": "AWS::Events::Rule" 167 | }, 168 | "ChaosLambdaRulePermission": { 169 | "Properties": { 170 | "Action": "lambda:InvokeFunction", 171 | "FunctionName": { 172 | "Fn::GetAtt": [ 173 | "ChaosLambdaFunction", 174 | "Arn" 175 | ] 176 | }, 177 | "Principal": "events.amazonaws.com", 178 | "SourceArn": { 179 | "Fn::GetAtt": [ 180 | "ChaosLambdaRule", 181 | "Arn" 182 | ] 183 | } 184 | }, 185 | "Type": "AWS::Lambda::Permission" 186 | }, 187 | "ChaosLambdaTerminationTopic": { 188 | "Type": "AWS::SNS::Topic" 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /cloudformation/templates/metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "Description": "Metrics and filters for Chaos Lambda", 3 | "Parameters": { 4 | "LambdaLogGroupName": { 5 | "Description": "The name of the log group for the lambda function.", 6 | "Type": "String" 7 | } 8 | }, 9 | "Resources": { 10 | "liveliness": { 11 | "Properties": { 12 | "FilterPattern": "[datetime, event=\"triggered\", ...]", 13 | "LogGroupName": { 14 | "Ref": "LambdaLogGroupName" 15 | }, 16 | "MetricTransformations": [ 17 | { 18 | "MetricName": "triggered", 19 | "MetricNamespace": "BBC/CHAOS-LAMBDA", 20 | "MetricValue": "1" 21 | } 22 | ] 23 | }, 24 | "Type": "AWS::Logs::MetricFilter" 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/chaos.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import time 5 | 6 | import boto3 7 | 8 | 9 | 
PROBABILITY_TAG = "chaos-lambda-termination"
DEFAULT_PROBABILITY = 1.0 / 6.0


def log(*args):
    """Print ``args`` to stdout, prefixed with the current UTC timestamp."""
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    print(timestamp, *args)


def get_asg_tag(asg, name, default=None):
    """Return the value of the tag called ``name`` on ``asg``.

    Tag keys are compared case-insensitively.  A matching tag without a
    "Value" entry yields an empty string; ``default`` is returned when no
    tag matches.
    """
    name = name.lower()
    for tag in asg.get("Tags", []):
        if tag.get("Key", "").lower() == name:
            return tag.get("Value", "")
    return default


def safe_float(s, default):
    """Convert ``s`` to a float, returning ``default`` if it cannot be parsed.

    Both ``ValueError`` (malformed text) and ``TypeError`` (``None`` or any
    other non-numeric object) count as "cannot be parsed".
    """
    try:
        return float(s)
    except (TypeError, ValueError):
        return default


def get_asg_probability(asg, default):
    """Return the termination probability configured for ``asg``.

    The value is read from the ``PROBABILITY_TAG`` tag.  ``default`` is
    returned when the tag is absent, and also (after logging a
    "bad-probability" line) when the tag value is not a number in the
    closed range 0.0 to 1.0.
    """
    value = get_asg_tag(asg, PROBABILITY_TAG, None)
    if value is None:
        return default

    probability = safe_float(value, None)
    # NaN fails this chained comparison, so it is reported as a bad value.
    if probability is not None and 0.0 <= probability <= 1.0:
        return probability

    asg_name = asg["AutoScalingGroupName"]
    log("bad-probability", "[" + value + "]", "in", asg_name)
    return default


def get_asg_instance_id(asg, default):
    """Randomly pick an instance of ``asg`` to terminate, or return None.

    None is returned when the group lists no instances or when the random
    draw is not below the group's probability; otherwise the "InstanceId"
    of a randomly chosen instance is returned.
    """
    instances = asg.get("Instances", [])
    if not instances:
        return None

    probability = get_asg_probability(asg, default)
    if random.random() >= probability:
        return None
    return random.choice(instances).get("InstanceId", None)


def get_all_asgs(autoscaling):
    """Yield every Auto Scaling Group returned by the paginated
    ``describe_auto_scaling_groups`` call of the ``autoscaling`` client.

    Response pages without an "AutoScalingGroups" key are skipped.
    """
    paginator = autoscaling.get_paginator("describe_auto_scaling_groups")
    for response in paginator.paginate():
        yield from response.get("AutoScalingGroups", [])


def get_targets(autoscaling, default_probability):
    """Return a list of ``(asg_name, instance_id)`` pairs to terminate.

    Each group contributes at most one instance, chosen by
    ``get_asg_instance_id``.
    """
    targets = []
    for asg in get_all_asgs(autoscaling):
        instance_id = get_asg_instance_id(asg, default_probability)
        if instance_id is not None:
            targets.append((asg["AutoScalingGroupName"], instance_id))
    return targets


def send_notification(sns, instance_id, asg_name):
    """Publish a "chaos_lambda.terminating" event for one instance.

    The topic ARN is read from the ``termination_topic_arn`` environment
    variable; nothing is published when it is unset or blank.
    """
    topic = os.environ.get("termination_topic_arn", "").strip()
    if not topic:
        return
    notification = {
        "event_name": "chaos_lambda.terminating",
        "instance_id": instance_id,
        "asg_name": asg_name,
    }
    sns.publish(
        TopicArn=topic,
        Message=json.dumps(notification)
    )


def terminate_targets(ec2, sns, targets):
    """Notify about, then terminate, every instance in ``targets``.

    ``targets`` is an iterable of ``(asg_name, instance_id)`` pairs.
    Returns a list of ``(instance_id, state_name)`` pairs built from the
    "TerminatingInstances" entries of the ``terminate_instances`` response,
    logging one "result" line for each.
    """
    for asg_name, instance_id in targets:
        log("targeting", instance_id, "in", asg_name)
        try:
            send_notification(sns, instance_id, asg_name)
        except Exception as e:
            # Notifications are best-effort: a failure to publish is logged
            # but must not prevent the termination from going ahead.
            log("Failed to send notification", e)

    instance_ids = [instance_id for _, instance_id in targets]
    response = ec2.terminate_instances(InstanceIds=instance_ids)

    results = [
        (i["InstanceId"], i["CurrentState"]["Name"])
        for i in response.get("TerminatingInstances", [])
    ]
    for instance_id, state in results:
        log("result", instance_id, "is", state)

    return results


def chaos_lambda(regions, default_probability):
    """Run one round of chaos in each of ``regions``.

    For every region a "triggered" line is logged and targets are selected;
    the EC2 and SNS clients are only created when there is something to
    terminate.
    """
    for region in regions:
        log("triggered", region)
        autoscaling = boto3.client("autoscaling", region_name=region)
        targets = get_targets(autoscaling, default_probability)
        if targets:
            ec2 = boto3.client("ec2", region_name=region)
            sns = boto3.client("sns", region_name=region)
            terminate_targets(ec2, sns, targets)


def get_regions(context):
    """Return the list of region names to operate on.

    Regions come from the comma-separated ``regions`` environment variable
    (surrounding whitespace and empty entries are dropped).  When it is
    unset or blank, the region is taken from the fourth ":"-separated field
    of ``context.invoked_function_arn``.
    """
    v = os.environ.get("regions", "").strip()
    if not v:
        return [context.invoked_function_arn.split(":")[3]]
    stripped = (s.strip() for s in v.split(","))
    return [region for region in stripped if region]


def get_default_probability():
    """Return the default termination probability.

    Reads the ``probability`` environment variable, falling back to
    ``DEFAULT_PROBABILITY`` when it is unset or blank.

    Raises:
        ValueError: if the variable is not a valid float, or is NaN.
    """
    v = os.environ.get("probability", "").strip()
    if not v:
        return DEFAULT_PROBABILITY

    probability = float(v)
    # NaN is the only float unequal to itself.  It must be rejected because
    # "random.random() >= nan" is always False, which would select an
    # instance from every ASG on every run.
    if probability != probability:
        raise ValueError("probability must be a number, got " + repr(v))
    return probability


def handler(event, context):
    """Lambda entry point: resolve the configuration and run chaos_lambda."""
    regions = get_regions(context)
    probability = get_default_probability()
    chaos_lambda(regions, probability)
-------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | mock >= 1.0, < 1.1 2 | nose == 1.3.7 3 | -------------------------------------------------------------------------------- /test/base.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import sys 3 | from unittest import TestCase 4 | from unittest.mock import Mock, patch 5 | 6 | 7 | class PatchingTestCase(TestCase): 8 | 9 | patch_list = () 10 | 11 | def setUp(self): 12 | self.patches = [] 13 | for name in self.patch_list: 14 | p = patch(name) 15 | self.patches.append(p) 16 | setattr(self, name.split(".")[-1], p.start()) 17 | 18 | def tearDown(self): 19 | for p in self.patches: 20 | p.stop() 21 | 22 | 23 | @contextlib.contextmanager 24 | def mocked_imports(module_names): 25 | old = {} 26 | mocks = {} 27 | for name in module_names: 28 | old[name] = sys.modules.get(name, None) 29 | sys.modules[name] = mocks[name] = Mock() 30 | yield mocks 31 | for name, module in old.items(): 32 | if module is None: 33 | del sys.modules[name] 34 | else: 35 | sys.modules[name] = module 36 | -------------------------------------------------------------------------------- /test/test_chaos.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from unittest import mock 5 | 6 | from base import mocked_imports, PatchingTestCase 7 | 8 | with mocked_imports([ 9 | "boto3" 10 | ]): 11 | import chaos 12 | 13 | 14 | class TestGetASGTag(PatchingTestCase): 15 | 16 | def test_finds_tag_key_case_insensitively(self): 17 | asg = {"Tags": [{"Key": "name", "Value": "success"}]} 18 | self.assertEqual(chaos.get_asg_tag(asg, "NAME"), "success") 19 | self.assertEqual(chaos.get_asg_tag(asg, "name"), "success") 20 | self.assertEqual(chaos.get_asg_tag(asg, "NaMe"), "success") 21 | 22 | def 
test_returns_default_if_key_not_found(self): 23 | asg = {"Tags": []} 24 | self.assertEqual(chaos.get_asg_tag(asg, "blah"), None) 25 | self.assertEqual(chaos.get_asg_tag(asg, "blah", "abc"), "abc") 26 | 27 | def test_returns_empty_string_if_tag_has_no_value(self): 28 | # As far as I can tell this should never happen, but just in case... 29 | asg = {"Tags": [{"Key": "name"}]} 30 | self.assertEqual(chaos.get_asg_tag(asg, "name"), "") 31 | 32 | 33 | class TestSafeFloat(PatchingTestCase): 34 | 35 | def test_returns_float_if_string_is_valid(self): 36 | self.assertEqual(chaos.safe_float("1.0", 0.5), 1.0) 37 | self.assertEqual(chaos.safe_float("0.0", 0.5), 0.0) 38 | self.assertEqual(chaos.safe_float(" 1.0 ", 0.5), 1.0) 39 | 40 | def test_returns_default_if_string_is_invalid(self): 41 | self.assertEqual(chaos.safe_float("not a number", 0.5), 0.5) 42 | 43 | 44 | class TestGetASGProbability(PatchingTestCase): 45 | 46 | patch_list = ( 47 | "chaos.PROBABILITY_TAG", 48 | "chaos.get_asg_tag", 49 | "chaos.log" 50 | ) 51 | 52 | def get_log_lines(self, name): 53 | lines = [] 54 | for args, kwargs in self.log.call_args_list: 55 | parts = re.findall(r"\[.*?\]|[^ ]+", " ".join(args)) 56 | if parts[0] == name: 57 | lines.append(parts) 58 | return lines 59 | 60 | def test_returns_default_probability_if_no_tag_set(self): 61 | self.get_asg_tag.return_value = None 62 | p = chaos.get_asg_probability({}, mock.sentinel.default) 63 | self.assertEqual(p, mock.sentinel.default) 64 | 65 | def test_queries_probability_tag(self): 66 | self.get_asg_tag.return_value = "0.1" 67 | chaos.get_asg_probability(mock.sentinel.asg, None) 68 | self.get_asg_tag.assert_called_once_with( 69 | mock.sentinel.asg, 70 | self.PROBABILITY_TAG, 71 | mock.ANY 72 | ) 73 | 74 | def test_returns_probability_from_tag_value_if_valid(self): 75 | self.get_asg_tag.return_value = "0.1" 76 | p = chaos.get_asg_probability({"AutoScalingGroupName": "x"}, None) 77 | self.assertEqual(p, 0.1) 78 | 79 | def 
test_returns_default_probability_if_tag_value_is_invalid(self): 80 | asg = {"AutoScalingGroupName": "x"} 81 | default = mock.sentinel.default 82 | self.get_asg_tag.return_value = "blah" 83 | p = chaos.get_asg_probability(asg, default) 84 | self.assertEqual(p, default) 85 | 86 | def test_returns_default_probability_if_tag_value_is_out_of_range(self): 87 | asg = {"AutoScalingGroupName": "x"} 88 | default = mock.sentinel.default 89 | for value in ("-42", "-1.2", "1.2", "9"): 90 | self.get_asg_tag.return_value = value 91 | p = chaos.get_asg_probability(asg, default) 92 | self.assertEqual(p, default) 93 | 94 | def test_logs_parseable_error_if_tag_value_is_invalid(self): 95 | asg = {"AutoScalingGroupName": "ASGNameHere"} 96 | for value in ("blah", "-42", "0.1 0.2"): 97 | self.log.reset_mock() 98 | self.get_asg_tag.return_value = value 99 | chaos.get_asg_probability(asg, None) 100 | lines = self.get_log_lines("bad-probability") 101 | self.assertEqual(set((p[1], p[3]) for p in lines), set([ 102 | ("[" + value + "]", "ASGNameHere") 103 | ])) 104 | 105 | 106 | class TestGetASGInstanceId(PatchingTestCase): 107 | 108 | patch_list = ( 109 | "chaos.get_asg_probability", 110 | "random.choice", 111 | "random.random", 112 | ) 113 | 114 | def test_returns_None_if_there_are_no_instances(self): 115 | self.random.return_value = 1.0 116 | asg = {"Instances": []} 117 | self.assertEqual(chaos.get_asg_instance_id(asg, 0), None) 118 | asg = {} 119 | self.assertEqual(chaos.get_asg_instance_id(asg, 0), None) 120 | 121 | def test_returns_None_if_probability_test_fails(self): 122 | self.choice.side_effect = lambda l: l[0] 123 | self.get_asg_probability.return_value = 0.5 124 | self.random.return_value = 1.0 125 | asg = {"Instances": [{"InstanceId": "i-1234abcd"}]} 126 | default = mock.sentinel.default 127 | self.assertEqual(chaos.get_asg_instance_id(asg, default), None) 128 | self.get_asg_probability.assert_called_once_with(asg, default) 129 | 130 | def 
test_returns_instance_id_if_probability_test_succeeds(self): 131 | self.choice.side_effect = lambda l: l[0] 132 | self.get_asg_probability.return_value = 0.5 133 | self.random.return_value = 0.0 134 | asg = {"Instances": [{"InstanceId": "i-1234abcd"}]} 135 | default = mock.sentinel.default 136 | self.assertEqual(chaos.get_asg_instance_id(asg, default), "i-1234abcd") 137 | self.get_asg_probability.assert_called_once_with(asg, default) 138 | 139 | def test_returns_random_choice_of_instance_ids(self): 140 | self.get_asg_probability.return_value = 0.5 141 | self.random.return_value = 0.0 142 | instances = [ 143 | {"InstanceId": "i-00000000"}, 144 | {"InstanceId": "i-11111111"}, 145 | {"InstanceId": "i-22222222"} 146 | ] 147 | i = chaos.get_asg_instance_id({"Instances": instances}, 0) 148 | self.choice.assert_called_once_with(instances) 149 | self.assertEqual(i, self.choice.return_value.get.return_value) 150 | 151 | 152 | class TestGetAllASGs(PatchingTestCase): 153 | 154 | def test_uses_paginator_for_describe_auto_scaling_groups(self): 155 | autoscaling = mock.Mock() 156 | paginator = autoscaling.get_paginator.return_value 157 | paginator.paginate.return_value = iter([]) 158 | asgs = chaos.get_all_asgs(autoscaling) 159 | list(asgs) # force evaluation of the generator 160 | autoscaling.get_paginator.assert_called_once_with( 161 | "describe_auto_scaling_groups" 162 | ) 163 | paginator.paginate.assert_called_once_with() 164 | 165 | def test_yields_asgs_from_each_response(self): 166 | autoscaling = mock.Mock() 167 | paginator = autoscaling.get_paginator.return_value 168 | paginator.paginate.return_value = iter([ 169 | {"AutoScalingGroups": [mock.sentinel.one, mock.sentinel.two]}, 170 | {"AutoScalingGroups": [mock.sentinel.three]}, 171 | {"AutoScalingGroups": [mock.sentinel.four, mock.sentinel.five]} 172 | ]) 173 | asgs = chaos.get_all_asgs(autoscaling) 174 | self.assertEqual(set(asgs), set([ 175 | mock.sentinel.one, 176 | mock.sentinel.two, 177 | mock.sentinel.three, 178 | 
mock.sentinel.four, 179 | mock.sentinel.five 180 | ])) 181 | 182 | def test_ignores_responses_with_missing_AutoScalingGroups_key(self): 183 | autoscaling = mock.Mock() 184 | paginator = autoscaling.get_paginator.return_value 185 | paginator.paginate.return_value = iter([ 186 | {"AutoScalingGroups": [mock.sentinel.one]}, 187 | {}, 188 | {"AutoScalingGroups": [mock.sentinel.two]} 189 | ]) 190 | asgs = chaos.get_all_asgs(autoscaling) 191 | self.assertEqual(set(asgs), set([ 192 | mock.sentinel.one, 193 | mock.sentinel.two 194 | ])) 195 | 196 | 197 | class TestGetTargets(PatchingTestCase): 198 | 199 | patch_list = ( 200 | "chaos.get_all_asgs", 201 | "chaos.get_asg_instance_id", 202 | ) 203 | 204 | def test_requests_all_auto_scaling_groups(self): 205 | autoscaling = mock.Mock() 206 | self.get_all_asgs.return_value = iter([]) 207 | chaos.get_targets(autoscaling, 0) 208 | self.get_all_asgs.assert_called_once_with(autoscaling) 209 | 210 | def test_returns_empty_list_if_no_auto_scaling_groups(self): 211 | autoscaling = mock.Mock() 212 | self.get_all_asgs.return_value = iter([]) 213 | self.assertEqual(chaos.get_targets(autoscaling, 0), []) 214 | 215 | def test_passes_default_probability_to_get_asg_instance_id(self): 216 | autoscaling = mock.Mock() 217 | asg = {"AutoScalingGroupName": "a", "Instances": ["i-11111111"]} 218 | default = mock.sentinel.default_probablity 219 | self.get_asg_instance_id.return_value = None 220 | self.get_all_asgs.return_value = iter([asg]) 221 | chaos.get_targets(autoscaling, default) 222 | self.get_asg_instance_id.assert_called_once_with(asg, default) 223 | 224 | def test_gets_instance_from_each_asg(self): 225 | autoscaling = mock.Mock() 226 | self.get_asg_instance_id.side_effect = lambda asg, default: \ 227 | asg["Instances"][0] 228 | self.get_all_asgs.return_value = iter([ 229 | {"AutoScalingGroupName": "a", "Instances": ["i-11111111"]}, 230 | {"AutoScalingGroupName": "b", "Instances": ["i-22222222"]}, 231 | {"AutoScalingGroupName": "c", 
"Instances": ["i-33333333"]} 232 | ]) 233 | targets = chaos.get_targets(autoscaling, 0) 234 | self.assertEqual(set(targets), set([ 235 | ("a", "i-11111111"), 236 | ("b", "i-22222222"), 237 | ("c", "i-33333333") 238 | ])) 239 | 240 | def test_ignores_asgs_with_no_instances(self): 241 | autoscaling = mock.Mock() 242 | self.get_asg_instance_id.side_effect = lambda asg, default: \ 243 | asg["Instances"][0] if len(asg["Instances"]) != 0 else None 244 | self.get_all_asgs.return_value = iter([ 245 | {"AutoScalingGroupName": "a", "Instances": []}, 246 | {"AutoScalingGroupName": "b", "Instances": ["i-22222222"]}, 247 | {"AutoScalingGroupName": "c", "Instances": []} 248 | ]) 249 | targets = chaos.get_targets(autoscaling, 0) 250 | self.assertEqual(targets, [("b", "i-22222222")]) 251 | 252 | 253 | class TestTerminateTargets(PatchingTestCase): 254 | 255 | patch_list = ( 256 | "chaos.log", 257 | "chaos.os" 258 | ) 259 | 260 | def get_log_lines(self, name): 261 | lines = [] 262 | for args, kwargs in self.log.call_args_list: 263 | parts = re.findall(r"\[.*?\]|[^ ]+", " ".join(args)) 264 | if parts[0] == name: 265 | lines.append(parts) 266 | return lines 267 | 268 | def test_terminates_target_instances(self): 269 | ec2 = mock.Mock() 270 | sns = mock.Mock() 271 | ec2.terminate_instances.return_value = {} 272 | chaos.terminate_targets(ec2, sns, [ 273 | ("a", "i-11111111"), 274 | ("b", "i-22222222") 275 | ]) 276 | ec2.terminate_instances.assert_called_once_with( 277 | InstanceIds=["i-11111111", "i-22222222"] 278 | ) 279 | 280 | def test_parseable_log_line_for_each_targeted_instance(self): 281 | ec2 = mock.Mock() 282 | sns = mock.Mock() 283 | ec2.terminate_instances.return_value = {} 284 | chaos.terminate_targets(ec2, sns, [ 285 | ("asg-name-one", "i-00000000"), 286 | ("second-asg", "i-11111111"), 287 | ("the-third-asg", "i-22222222") 288 | ]) 289 | logged = self.get_log_lines("targeting") 290 | self.assertEqual(set((part[1], part[3]) for part in logged), set([ 291 | ("i-00000000", 
"asg-name-one"), 292 | ("i-11111111", "second-asg"), 293 | ("i-22222222", "the-third-asg") 294 | ])) 295 | 296 | def test_parseable_log_line_for_each_termination_result(self): 297 | ec2 = mock.Mock() 298 | sns = mock.Mock() 299 | # We're cheating here and returning results that are unrelated to the 300 | # list passed to terminate_targets 301 | ec2.terminate_instances.return_value = { 302 | "TerminatingInstances": [ 303 | {"InstanceId": "i-00000000", "CurrentState": {"Name": "s1"}}, 304 | {"InstanceId": "i-11111111", "CurrentState": {"Name": "s2"}}, 305 | {"InstanceId": "i-22222222", "CurrentState": {"Name": "s3"}} 306 | ] 307 | } 308 | chaos.terminate_targets(ec2, sns, [("a", "i-11111111")]) 309 | logged = self.get_log_lines("result") 310 | self.assertEqual(set((part[1], part[3]) for part in logged), set([ 311 | ("i-00000000", "s1"), 312 | ("i-11111111", "s2"), 313 | ("i-22222222", "s3") 314 | ])) 315 | 316 | def test_returns_termination_results(self): 317 | ec2 = mock.Mock() 318 | sns = mock.Mock() 319 | # We're cheating here and returning results that are unrelated to the 320 | # list passed to terminate_targets 321 | ec2.terminate_instances.return_value = { 322 | "TerminatingInstances": [ 323 | {"InstanceId": "i-00000000", "CurrentState": {"Name": "s1"}}, 324 | {"InstanceId": "i-11111111", "CurrentState": {"Name": "s2"}}, 325 | {"InstanceId": "i-22222222", "CurrentState": {"Name": "s3"}} 326 | ] 327 | } 328 | results = chaos.terminate_targets(ec2, sns, []) 329 | self.assertEqual(set(results), set([ 330 | ("i-00000000", "s1"), 331 | ("i-11111111", "s2"), 332 | ("i-22222222", "s3") 333 | ])) 334 | 335 | def test_sends_notification_per_instance(self): 336 | self.os.environ.get.return_value = "MyTestTopic" 337 | ec2 = mock.Mock() 338 | sns = mock.Mock() 339 | ec2.terminate_instances.return_value = { 340 | "TerminatingInstances": [] 341 | } 342 | chaos.terminate_targets(ec2, sns, [("a1", "i1"), ("a2", "i2")]) 343 | sns.publish.assert_any_call( 344 | 
TopicArn="MyTestTopic", 345 | Message=MatchJson({ 346 | "event_name": "chaos_lambda.terminating", 347 | "asg_name": "a1", 348 | "instance_id": "i1" 349 | }) 350 | ) 351 | sns.publish.assert_any_call( 352 | TopicArn="MyTestTopic", 353 | Message=MatchJson({ 354 | "event_name": "chaos_lambda.terminating", 355 | "asg_name": "a2", 356 | "instance_id": "i2" 357 | }) 358 | ) 359 | self.assertEqual(2, sns.publish.call_count) 360 | 361 | def test_handles_sns_exception(self): 362 | self.os.environ.get.return_value = "MyTestTopic" 363 | ec2 = mock.Mock() 364 | sns = mock.Mock() 365 | ec2.terminate_instances.return_value = {} 366 | 367 | sns.publish.side_effect = Exception("boom") 368 | chaos.terminate_targets(ec2, sns, [ 369 | ("a", "i-11111111"), 370 | ("b", "i-22222222") 371 | ]) 372 | ec2.terminate_instances.assert_called_once_with( 373 | InstanceIds=["i-11111111", "i-22222222"] 374 | ) 375 | 376 | 377 | class MatchJson: 378 | ''' 379 | A JSON Matcher that takes a Dictionary as input, checking that those 380 | specified keys and values exist in the JSON string that is supplied. It 381 | does not check if there are any other keys in the JSON string. 
382 | ''' 383 | def __init__(self, expected): 384 | self.expected = expected 385 | 386 | def __repr__(self): 387 | return "'" + json.dumps(self.expected) + "'" 388 | 389 | def __eq__(self, json_string): 390 | try: 391 | parsed_json = json.loads(json_string) 392 | for key in self.expected.keys(): 393 | try: 394 | if self.expected[key] != parsed_json[key]: 395 | return False 396 | except KeyError: 397 | print("The key '" + key + "' does not exist") 398 | return False 399 | except ValueError: 400 | print("Message passed to sns.publish was not valid JSON") 401 | return False 402 | return True 403 | 404 | 405 | class TestChaosLambda(PatchingTestCase): 406 | 407 | patch_list = ( 408 | "chaos.boto3", 409 | "chaos.get_targets", 410 | "chaos.log", 411 | "chaos.terminate_targets", 412 | ) 413 | 414 | def setUp(self): 415 | super(TestChaosLambda, self).setUp() 416 | self.clients = {} 417 | self.boto3.client.side_effect = self.make_client 418 | 419 | def make_client(self, name, region_name): 420 | c = self.clients.get(name, None) 421 | if c is not None: 422 | self.assertEqual(c.region_name, region_name) 423 | else: 424 | c = self.clients[name] = mock.Mock(region_name=region_name) 425 | return c 426 | 427 | def test_parseable_log_line_for_trigger(self): 428 | self.get_targets.return_value = [] 429 | chaos.chaos_lambda(["sp-moonbase-1"], 0) 430 | self.log.assert_called_once_with("triggered", "sp-moonbase-1") 431 | 432 | def test_does_nothing_if_no_targets(self): 433 | self.get_targets.return_value = [] 434 | chaos.chaos_lambda(["sp-moonbase-1"], 0) 435 | self.assertEqual(self.terminate_targets.call_count, 0) 436 | 437 | def test_uses_autoscaling_service_in_correct_region(self): 438 | self.get_targets.return_value = [] 439 | chaos.chaos_lambda(["sp-moonbase-1"], 0) 440 | autoscaling = self.get_targets.call_args[0][0] 441 | self.assertEqual(autoscaling, self.clients["autoscaling"]) 442 | self.assertEqual(autoscaling.region_name, "sp-moonbase-1") 443 | 444 | def 
test_passes_default_probability_to_get_targets(self): 445 | default = mock.sentinel.default 446 | self.get_targets.return_value = [] 447 | chaos.chaos_lambda(["sp-moonbase-1"], default) 448 | self.assertEqual(self.get_targets.call_args[0][1], default) 449 | 450 | def test_terminates_target_instances_in_correct_region(self): 451 | targets = [("a", "i-11111111"), ("b", "i-22222222")] 452 | self.get_targets.return_value = targets 453 | ec2 = self.make_client("ec2", region_name="sp-moonbase-1") 454 | sns = self.make_client("sns", region_name="sp-moonbase-1") 455 | chaos.chaos_lambda(["sp-moonbase-1"], 0) 456 | # Above triggers self.make_client, which checks the region name 457 | self.terminate_targets.assert_called_once_with(ec2, sns, targets) 458 | 459 | 460 | class TestGetRegions(PatchingTestCase): 461 | 462 | patch_list = ( 463 | "chaos.os", 464 | ) 465 | 466 | def test_looks_for_a_regions_environment_variable(self): 467 | self.os.environ.get.return_value = "" 468 | context = mock.Mock() 469 | context.invoked_function_arn = "arn:aws:lambda:re-gion-1:..." 470 | chaos.get_regions(context) 471 | self.os.environ.get.assert_called_once_with("regions", "") 472 | 473 | def test_extracts_region_from_context_if_no_regions_variable(self): 474 | self.os.environ.get.return_value = "" 475 | context = mock.Mock() 476 | for region in ("eu-west-1", "sp-moonbase-1"): 477 | context.invoked_function_arn = "arn:aws:lambda:" + region + ":..." 
478 | result = chaos.get_regions(context) 479 | self.assertEqual(result, [region]) 480 | 481 | def test_reads_from_comma_separated_regions_variable_if_set(self): 482 | self.os.environ.get.return_value = "re-gion-1,sp-moonbase-1" 483 | result = chaos.get_regions(mock.Mock()) 484 | self.assertEqual(result, ["re-gion-1", "sp-moonbase-1"]) 485 | 486 | def test_ignores_whitespace_in_regions_variable(self): 487 | self.os.environ.get.return_value = "\n sp-moonbase-1\n, re-gion-1 " 488 | result = chaos.get_regions(mock.Mock()) 489 | self.assertEqual(result, ["sp-moonbase-1", "re-gion-1"]) 490 | 491 | 492 | class TestGetDefaultProbability(PatchingTestCase): 493 | 494 | patch_list = ( 495 | "chaos.os", 496 | ) 497 | 498 | def test_looks_for_a_probability_environment_variable(self): 499 | self.os.environ.get.return_value = "" 500 | chaos.get_default_probability() 501 | self.os.environ.get.assert_called_once_with("probability", "") 502 | 503 | def test_returns_default_if_no_probability_variable(self): 504 | self.os.environ.get.return_value = "" 505 | p = chaos.get_default_probability() 506 | self.assertEqual(p, chaos.DEFAULT_PROBABILITY) 507 | 508 | def test_returns_float_value_of_probability_variable(self): 509 | for s in ("0.1", "0.2", "0.3"): 510 | self.os.environ.get.return_value = s 511 | p = chaos.get_default_probability() 512 | self.assertEqual(p, float(s)) 513 | 514 | def test_ignores_whitespace_in_probability_variable(self): 515 | self.os.environ.get.return_value = " \n0.1 " 516 | p = chaos.get_default_probability() 517 | self.assertEqual(p, 0.1) 518 | 519 | 520 | class TestHandler(PatchingTestCase): 521 | 522 | patch_list = ( 523 | "chaos.chaos_lambda", 524 | "chaos.get_default_probability", 525 | "chaos.get_regions", 526 | ) 527 | 528 | def test_passes_along_the_region_list(self): 529 | context = mock.sentinel.context 530 | chaos.handler(None, context) 531 | self.get_regions.assert_called_once_with(context) 532 | self.chaos_lambda.assert_called_once_with( 533 | 
self.get_regions.return_value, 534 | mock.ANY 535 | ) 536 | 537 | def test_passes_along_the_default_probability(self): 538 | chaos.handler(None, mock.Mock()) 539 | self.get_default_probability.assert_called_once_with() 540 | self.chaos_lambda.assert_called_once_with( 541 | mock.ANY, 542 | self.get_default_probability.return_value 543 | ) 544 | --------------------------------------------------------------------------------