├── docs ├── images │ ├── empty_cw_dashboard.png │ ├── aws_managed_credentials.png │ └── serverless-etl-architecture.png ├── lab_2_inject_fault.md ├── lab_1_serverless_etl.md ├── lab_4_chaos_experiment_2.md └── lab_3_chaos_experiment.md ├── .gitignore ├── Pipfile ├── terraform ├── template.tf ├── database.tf ├── filestore.tf ├── messaging.tf ├── application.tf ├── outputs.tf └── monitoring.tf ├── src ├── package.json └── lambda.js ├── drivers ├── the_subscriber.py └── the_publisher.py └── README.md /docs/images/empty_cw_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpbarto/serverless-chaos-lab/HEAD/docs/images/empty_cw_dashboard.png -------------------------------------------------------------------------------- /docs/images/aws_managed_credentials.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpbarto/serverless-chaos-lab/HEAD/docs/images/aws_managed_credentials.png -------------------------------------------------------------------------------- /docs/images/serverless-etl-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jpbarto/serverless-chaos-lab/HEAD/docs/images/serverless-etl-architecture.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | node_modules 3 | *.tfstate* 4 | .terraform 5 | Pipfile.lock 6 | chaostoolkit.log 7 | journal.json 8 | tmp_data 9 | notes.md 10 | *.pyc 11 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | chaostoolkit = "*" 10 | chaostoolkit-aws = "*" 11 | 12 | [requires] 13 | python_version = "3.6" 14 | -------------------------------------------------------------------------------- /terraform/template.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12" 3 | } 4 | 5 | provider "aws" { 6 | } 7 | 8 | data "aws_caller_identity" "current" {} 9 | data "aws_region" "current" {} 10 | 11 | resource "random_id" "chaos_stack" { 12 | byte_length = 8 13 | } -------------------------------------------------------------------------------- /terraform/database.tf: -------------------------------------------------------------------------------- 1 | ######################################### 2 | # 3 | # Chaos-prepared DynamoDB database 4 | # 5 | ######################################### 6 | 7 | resource "aws_dynamodb_table" "chaos_data_table" { 8 | name = "chaos-data-${random_id.chaos_stack.hex}" 9 | billing_mode = "PAY_PER_REQUEST" 10 | hash_key = "symbol" 11 | range_key = "entryType" 12 | 13 | attribute { 14 | name = "symbol" 15 | type = "S" 16 | } 17 | 18 | attribute { 19 | name = "entryType" 20 | type = "S" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lambda_handler", 3 | "version": "1.0.0", 4 | "description": "Simple serverless ETL Lambda function", 5 | "main": "lambda.js", 6 | "dependencies": { 7 
| "aws-sdk": "^2.619.0", 8 | "failure-lambda": "^0.4.1", 9 | "json2csv": "^4.5.4", 10 | "uuid": "^8.3.2" 11 | }, 12 | "devDependencies": {}, 13 | "scripts": { 14 | "test": "echo \"Error: no test specified\" && exit 1" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/jpbarto/serverless-chaos-lab.git" 19 | }, 20 | "keywords": [ 21 | "serverless", 22 | "chaos", 23 | "etl" 24 | ], 25 | "author": "Jason Barto", 26 | "license": "Apache-2.0", 27 | "bugs": { 28 | "url": "https://github.com/jpbarto/serverless-chaos-lab/issues" 29 | }, 30 | "homepage": "https://github.com/jpbarto/serverless-chaos-lab#readme" 31 | } 32 | -------------------------------------------------------------------------------- /terraform/filestore.tf: -------------------------------------------------------------------------------- 1 | ######################################### 2 | # 3 | # S3 bucket for receiving new data inputs 4 | # 5 | ######################################### 6 | 7 | resource "aws_s3_bucket" "chaos_bucket" { 8 | bucket = "chaos-bucket-${random_id.chaos_stack.hex}" 9 | force_destroy = true 10 | } 11 | 12 | resource "aws_s3_bucket_public_access_block" "example" { 13 | bucket = aws_s3_bucket.chaos_bucket.id 14 | 15 | block_public_acls = true 16 | block_public_policy = true 17 | ignore_public_acls = true 18 | restrict_public_buckets = true 19 | } 20 | 21 | output "chaos_bucket_name" { 22 | value = aws_s3_bucket.chaos_bucket.id 23 | } 24 | 25 | resource "aws_s3_bucket_notification" "chaos_bucket_notifications" { 26 | bucket = aws_s3_bucket.chaos_bucket.id 27 | 28 | queue { 29 | queue_arn = aws_sqs_queue.chaos_json_queue.arn 30 | events = ["s3:ObjectCreated:*"] 31 | filter_prefix = "input/" 32 | filter_suffix = ".json" 33 | } 34 | 35 | topic { 36 | topic_arn = aws_sns_topic.chaos_topic.arn 37 | events = ["s3:ObjectCreated:*"] 38 | filter_prefix = "output/" 39 | filter_suffix = ".csv" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /drivers/the_subscriber.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import boto3 4 | from time import time, sleep 5 | from datetime import datetime as dt 6 | import json 7 | import sys 8 | 9 | from aws_resource_names import SQS_QUEUE_NAME 10 | 11 | sqs = boto3.client('sqs') 12 | queue_url = sqs.get_queue_url (QueueName=SQS_QUEUE_NAME) 13 | queue_url = queue_url['QueueUrl'] 14 | 15 | run_flag = True 16 | 17 | obj_count = 0 18 | iter_obj_count = 0 19 | obj_limit = 2 # number of objects per second to get 20 | start_time = time () 21 | last_print_time = time () 22 | seg_start_time = time () 23 | while run_flag: 24 | try: 25 | # check every 10 sec to keep at the desired get rate 26 | if iter_obj_count < obj_limit * 10: 27 | resp = sqs.receive_message (QueueUrl=queue_url, WaitTimeSeconds=1, MaxNumberOfMessages=obj_limit) 28 | if 'Messages' in resp: 29 | for msg in resp['Messages']: 30 | sqs.delete_message (QueueUrl=queue_url, ReceiptHandle=msg['ReceiptHandle']) 31 | obj_count += 1 32 | iter_obj_count += 1 33 | else: 34 | sleep_time = 10 - (time () - seg_start_time) 35 | if sleep_time > 0: 36 | sleep (sleep_time) 37 | print ("{}: Retrieved {} objects for a total of {} objects".format (dt.now ().strftime ('%Y-%b-%d %H:%M:%S'), iter_obj_count, obj_count)) 38 | seg_start_time = time () 39 | iter_obj_count = 0 40 | 41 | except KeyboardInterrupt: 42 | print ("Retrieved a total of {} objects; exiting...".format (obj_count)) 43 | sys.exit 
(0) 44 | -------------------------------------------------------------------------------- /drivers/the_publisher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import boto3 4 | from time import time, sleep 5 | import sys 6 | from datetime import datetime as dt 7 | import json 8 | from random import random, shuffle 9 | from itertools import product 10 | from string import ascii_uppercase 11 | 12 | from aws_resource_names import S3_BUCKET_NAME 13 | 14 | # Method 2: Client.put_object() 15 | s3 = boto3.client('s3') 16 | 17 | run_flag = True 18 | 19 | obj_count = 0 20 | iter_obj_count = 0 21 | obj_limit = 2 # number of objects per second to put 22 | err_rate = 0.01 # what percentage of messages should be flawed, 0.1 == 10% of messages will have syntax errors 23 | start_time = time () 24 | symbols = [''.join(i) for i in product (ascii_uppercase, repeat=4)] 25 | shuffle (symbols) 26 | print ("Publishing messages for {} symbols".format (len (symbols))) 27 | 28 | message_id = 0 29 | seg_start_time = time () 30 | try: 31 | while run_flag: 32 | # go to sleep every 10 seconds to try to keep publication rate at the obj_limit setting 33 | if iter_obj_count < obj_limit * 10: 34 | symbol = symbols.pop () 35 | message_id += 1 36 | obj_name = 'data_object_msg-{}.json'.format (message_id) 37 | data = {'symbol': symbol, 'messageId': message_id, 'value': 10, 'objectName': obj_name, 'submissionDate': dt.now().strftime ('%d-%b-%Y %H:%M:%S'), 'author': 'the_publisher.py', 'version': 1.1} 38 | body = json.dumps (data) 39 | if random () < err_rate: 40 | body = body.replace ('"','',1) # if we should inject an erroneous message send malformed JSON with a syntax error 41 | s3.put_object(Body=body, Bucket=S3_BUCKET_NAME, Key='input/{}'.format (obj_name)) 42 | obj_count += 1 43 | iter_obj_count += 1 44 | else: 45 | # we've published enough messages for this 10 sec segment, sleep now 46 | sleep_time = 10 - (time () - seg_start_time) 47 | if sleep_time > 0: 48 | sleep (sleep_time) 49 | print ("{}: Pushed {} objects for a total of {} objects".format (dt.now ().strftime ('%Y-%b-%d %H:%M:%S'), iter_obj_count, obj_count)) 50 | seg_start_time = time () 51 | iter_obj_count = 0 52 | 53 | except KeyboardInterrupt: 54 | print ("Pushed a total of {} objects; exiting...".format (obj_count)) 55 | sys.exit (0) 56 | -------------------------------------------------------------------------------- /terraform/messaging.tf: -------------------------------------------------------------------------------- 1 | ######################################### 2 | # 3 | # Chaos File Processed topic 4 | # 5 | ######################################### 6 | 7 | resource "aws_sns_topic" "chaos_topic" { 8 | name = "chaos-csv-notification-topic-${random_id.chaos_stack.hex}" 9 | 10 | policy = < { 25 | console.log('Lambda instance ID:', lambdaInstanceId, ': v1 ETL processor handling event', JSON.stringify(event)); 26 | 27 | var s3Event = JSON.parse(event.Records[0].body); 28 | console.log('Extracted S3 event', JSON.stringify(s3Event)); 29 | 30 | // retrieve key fields from the S3 event object 31 | var srcBucket = s3Event.Records[0].s3.bucket.name; 32 | // Object key may have spaces or unicode non-ASCII characters. 
33 | var srcKey = decodeURIComponent(s3Event.Records[0].s3.object.key.replace(/\+/g, " ")); 34 | var dstBucket = srcBucket; 35 | var dstKey = "output/" + srcKey.replace(/input\//g, "") + ".csv"; 36 | 37 | console.log("Reading JSON file", srcKey, "in bucket", srcBucket); 38 | 39 | const data = await s3.getObject({ Bucket: srcBucket, Key: srcKey }).promise(); 40 | var jsonData = JSON.parse(data.Body.toString('utf-8')); 41 | console.log("Retrieved JSON data:", jsonData); 42 | 43 | // Update the database with the latest summary of the symbol 44 | let params = { 45 | TableName: chaosDataTable, 46 | Key: { 47 | "symbol": jsonData.symbol, 48 | "entryType": "latest" 49 | }, 50 | UpdateExpression: "ADD updateCount :i, symbolValue :v SET lastMessage = :mid", 51 | ExpressionAttributeValues: { 52 | ":mid": jsonData.messageId, 53 | ":v": jsonData.value, 54 | ":i": 1 55 | }, 56 | ReturnValues: "UPDATED_NEW" 57 | }; 58 | 59 | ddb.update(params, function(err, data) { 60 | if (err) { 61 | console.error("Unable to update", jsonData.symbol, "aggregate record in DynamoDB, Error JSON:", JSON.stringify(err, null, 2)); 62 | } 63 | else { 64 | console.log("Updated DynamoDB for", jsonData.symbol, ":", JSON.stringify(data, null, 2)); 65 | } 66 | }); 67 | 68 | // record the individual record 69 | var dateStr = (new Date()).toISOString(); 70 | params = { 71 | TableName: chaosDataTable, 72 | Key: { 73 | "symbol": jsonData.symbol, 74 | "entryType": dateStr + "#" + jsonData.messageId 75 | }, 76 | UpdateExpression: "SET symbolValue =:v, processingTimestamp = :d, messageId = :mid", 77 | ExpressionAttributeValues: { 78 | ":mid": jsonData.messageId, 79 | ":v": jsonData.value, 80 | ":d": dateStr 81 | }, 82 | ReturnValues: "UPDATED_NEW" 83 | }; 84 | ddb.update(params, function(err, data) { 85 | if (err) { 86 | console.error("Unable to record message ID in DynamoDB, Error JSON:", JSON.stringify(err, null, 2)); 87 | } 88 | else { 89 | console.log("Recorded", jsonData.symbol, "message ID", jsonData.messageId, "in DynamoDB", JSON.stringify(data, null, 2)); 90 | 91 | var cwParams = { 92 | MetricData: [{ 93 | MetricName: 'SymbolWriteCount', 94 | Dimensions: [{ 95 | Name: 'DynamoDBTable', 96 | Value: chaosDataTable 97 | }], 98 | StorageResolution: 1, 99 | Timestamp: new Date(), 100 | Unit: 'Count', 101 | Value: 1, 102 | }], 103 | Namespace: 'ChaosTransformer' 104 | }; 105 | 106 | cloudwatch.putMetricData(cwParams, function(err, data) { 107 | if (err) { 108 | console.log("Error logging custom metrics:", err, err.stack); 109 | } 110 | else { 111 | console.log("Successfully logged custom metric update:", data); 112 | } 113 | }); 114 | 115 | } 116 | }); 117 | 118 | 119 | /** 120 | * Perform the ETL and write the converted data to S3 121 | */ 122 | try { 123 | const csvData = parse(jsonData, opts); 124 | console.log("Parsed CSV data from JSON:", csvData); 125 | const result = await s3.putObject({ Bucket: dstBucket, Key: dstKey, Body: csvData, ContentType: 'text/csv' }).promise(); 126 | } 127 | catch (err) { 128 | console.error(err); 129 | callback(err); 130 | } 131 | 132 | /** 133 | * Respond completion back to the Lambda systems 134 | * */ 135 | const response = { 136 | statusCode: 200, 137 | body: JSON.stringify('Input conversion complete') 138 | }; 139 | 140 | console.log("ETL processer completed processing of", srcKey, "in bucket", srcBucket); 141 | return response; 142 | }); 143 | -------------------------------------------------------------------------------- /docs/lab_2_inject_fault.md: 
-------------------------------------------------------------------------------- 1 | # Lab 2: Evaluate failure injection with AWS Lambda 2 | 3 | ## Overview 4 | 5 | In this lab you will use the [Failure-Lambda](https://www.npmjs.com/package/failure-lambda) NPM package to inject failures into the AWS Lambda function of your ETL architecture. 6 | 7 | ## Failure-Lambda 8 | 9 | A challenge with serverless services like DynamoDB, SQS, and Lambda is that you cannot directly affect the underlying service to introduce disruption. Some developers have begun to create abilities to simulate failures however and we will use one such library today to disrupt the Lambda function in our architecture. 10 | 11 | [Failure-Lambda](https://github.com/gunnargrosch/failure-lambda) was created by [Gunnar Grosch](https://grosch.se/) to allow teams to inject turbulence into NodeJS code. It provides a wrapper around NodeJS functions and can artificially cause: 12 | 13 | - latency 14 | - network disruption 15 | - disk out of space errors 16 | - artificial exceptions 17 | - return a specific response code 18 | 19 | If you review the [code](../src/lambda.js) for your AWS Lambda function you will note that it already has the Failure-Lambda wrapper in place. The Lambda function also has an environment variable defined which points at a key-value pair in AWS Parameter Store. Have a look at the [AWS Lambda console](https://console.aws.amazon.com/lambda/home?#/functions) to find the name of the parameter and then review the current value of the parameter in the [Parameter Store console](https://console.aws.amazon.com/systems-manager/parameters?). 20 | 21 | ## Test your Lambda Function 22 | 23 | 1. Create an AWS Lambda test event 24 | 25 | To begin experimenting with the Failure-Lambda package we will need to first create a Test Event that we can use to execute the Lambda function. Looking at the detail page for your Lambda function on the [AWS Lambda console](https://console.aws.amazon.com/lambda/home?#/functions) you will see in the upper-right of the screen a drop down labeled `Select a test event` next to a `Test` button. From the drop down select `Configure test events`. 26 | 27 | 1. Define the event body 28 | 29 | Give the event a name such as `TestObject001` and use the following JSON. Be sure and modify the JSON replacing the two occurrences of `< YOUR S3 BUCKET NAME >` with the name of the S3 bucket created in the first lab. 
30 | 31 | ```json 32 | { 33 | "Records": [ 34 | { 35 | "messageId": "129ec50c-d702-4754-aaba-efd5376c63ab", 36 | "receiptHandle": "AQEB/4rCbig4Mkm6JphKbx9okY=", 37 | "body": "{\"Records\":[{\"eventVersion\":\"2.1\",\"eventSource\":\"aws:s3\",\"awsRegion\":\"eu-west-2\",\"eventTime\":\"2020-03-08T00:40:44.110Z\",\"eventName\":\"ObjectCreated:Put\",\"userIdentity\":{\"principalId\":\"AWS:AROAZXQEFEFVE:i-00f000f4c212ad0d4\"},\"requestParameters\":{\"sourceIPAddress\":\"3.9.176.208\"},\"responseElements\":{\"x-amz-request-id\":\"81EBAE99F537B548\",\"x-amz-id-2\":\"7AZjqqd/C7ptM8LijtnCWEV\"},\"s3\":{\"s3SchemaVersion\":\"1.0\",\"configurationId\":\"tf-s3-queue-20203213469500000002\",\"bucket\":{\"name\":\"< YOUR S3 BUCKET NAME >\",\"ownerIdentity\":{\"principalId\":\"A3N0SH17G\"},\"arn\":\"arn:aws:s3:::< YOUR S3 BUCKET NAME >\"},\"object\":{\"key\":\"input/data_object_msg-1.json\",\"size\":175,\"eTag\":\"4bb6a876175bd3a503be348dcc5fbd9f\",\"sequencer\":\"005E643F0D2EB9D5EA\"}}}]}", 38 | "attributes": { 39 | "ApproximateReceiveCount": "1", 40 | "SentTimestamp": "1583628046604", 41 | "SenderId": "AIDAIKZTX7KCMABLW", 42 | "ApproximateFirstReceiveTimestamp": "1583628050669" 43 | }, 44 | "messageAttributes": {}, 45 | "md5OfBody": "57107cfd574671604fde285823dcdef7", 46 | "eventSource": "aws:sqs", 47 | "eventSourceARN": "arn:aws:sqs:eu-west-2:771234451234:chaos-json-work-queue-cedabABCD32b8513", 48 | "awsRegion": "eu-west-2" 49 | } 50 | ] 51 | } 52 | ``` 53 | 54 | 1. Perform a successful test 55 | 56 | Before we start injecting failures lets ensure your function is testing normally. Click the `Test` button with your new test event defined. The test should result in a `Succeeded` status. 57 | 58 | 1. Configure Failure-Lambda for latency injection 59 | 60 | Modify the parameter store value you found earlier to have the following value: 61 | 62 | ```json 63 | { 64 | "isEnabled": true, 65 | "failureMode": "latency", 66 | "rate": 1, 67 | "minLatency": 1000, 68 | "maxLatency": 5000 69 | } 70 | ``` 71 | 72 | This instructs the Failure-Lambda wrapper in the Lambda function to inject latency for every execution (`rate: 1`) with a random latency between 1 second and 5 seconds. 73 | 74 | 1. Execute the impaired Lambda function 75 | 76 | Return to the AWS Lambda console and re-run your Lambda test, observe the results. How long did the function take to execute? If you execute it again how long does it take a second time around? What is the impact of the latency injection? Does it cause any failures? 77 | 78 | 1. Configure Failure-Lambda for network failure 79 | 80 | Failure-Lambda has the ability to block network access to specified domains. This simulates a loss of connectivity which could be caused by a network outage or a service disruption at an endpoint. To inject network failure set the parameter to a value of the following: 81 | 82 | ```json 83 | { 84 | "isEnabled": true, 85 | "failureMode": "blacklist", 86 | "rate": 1, 87 | "blacklist": ["dynamodb.*.amazonaws.com"] 88 | } 89 | ``` 90 | 91 | This configuration will cause Failure-Lambda to, 100% of the time, disallow any network communication with the DynamoDB service in any AWS region. 92 | 93 | 1. Execute the network impaired Lambda function 94 | 95 | Return to the AWS Lambda console again and execute the Lambda function. What are the effects of the network issue? 96 | 97 | 1. Disable the Failure-Lambda wrapper 98 | 99 | To prepare for your first chaos experiment temporarily disable the Failure-Lambda wrapper. 
Set the parameter value back to the following to disable failure injection: 100 | 101 | ```json 102 | { 103 | "isEnabled": false 104 | } 105 | ``` 106 | 107 | ## Summary 108 | 109 | In this lab you learned about the Failure-Lambda NodeJS library and how it can be used to inject artificial failures and disruption into your Lambda functions. 110 | 111 | In [the next lab](lab_3_chaos_experiment.md) you will craft your first chaos experiment which will use the Failure-Lambda library to perturb your ETL architecture and observe the system's ability to perform in turbulent conditions. -------------------------------------------------------------------------------- /docs/lab_1_serverless_etl.md: -------------------------------------------------------------------------------- 1 | # Lab 1: Build a serverless ETL pipeline 2 | 3 | ## Overview 4 | 5 | In this lab you will use infrastructure-as-code tooling to deploy a serverless ETL pipeline into AWS. This pipeline is designed to accept JSON documents, convert them to CSV, and update a DynamoDB table with stats pulled from the data. The infrastructure-as-code will create an architecture similar to that shown below. 6 | 7 | ![Serverless ETL Architecture](images/serverless-etl-architecture.png) 8 | 9 | ETL Workflow: 10 | The process is started when a JSON document is put into the Chaos-Bucket. 11 | 12 | 1. S3 sends a notification of the new object to SQS 13 | 2a. SQS triggers Lambda to process the event 14 | 2b. IFF the Lambda fails to process the event the event is stored in a dead letter queue with the SQS service 15 | 3. The Lambda function updates DynamoDB with summary data of the record and a copy of the record itself 16 | 4. The Lambda function writes the tranformed file as a CSV back to the Chaos-Bucket 17 | 5. S3 sends a notification of the transformed object to SNS 18 | 6. SNS sends a copy of the event to SQS for retrieval by a downstream client or subscriber 19 | 20 | After deploying the architecture you will use drivers to publish and consume JSON and CSV files through the architecture. These drivers will run throughout the labs. 21 | 22 | ## Objectives 23 | - Observe the architecture and assess the applications steady state 24 | - Review the custom code in the AWS Lambda function 25 | - Determine the service level objectives you will use to measure your steady state 26 | 27 | --- 28 | 29 | ## Create the pipeline 30 | 31 | > Note: If you are using an AWS Cloud9 IDE you may need to disable [AWS managed credentials](https://docs.aws.amazon.com/cloud9/latest/user-guide/how-cloud9-with-iam.html#auth-and-access-control-temporary-managed-credentials). To disable AWS managed credentials open the **Preferences** pane for your IDE and disable `AWS managed credentials`. 32 | ![AWS managed credentials](images/aws_managed_credentials.png) 33 | 34 | 1. Using the Terraform cli, deploy the architecture 35 | 36 | ```bash 37 | $ cd terraform 38 | $ terraform init 39 | $ terraform apply -auto-approve 40 | ``` 41 | 42 | > If you are asked to specify a region use an AWS region code such as `us-east-2`, `eu-west-2`, or `ap-southeast-1`. 43 | 44 | ## Review the architecture 45 | 46 | 1. Review the ETL Lambda function via the [AWS Lambda console](https://console.aws.amazon.com/lambda/home?#/functions). 47 | 48 | The function will be named something like `ChaosTransformer-1234ABCD`. Make a note of the function's configuration, what causes it to execute, how long can it execute for, what environment variables does it have configured, etc. 
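   If you would rather capture those details from a terminal than from the console, a short boto3 sketch along the following lines pulls the same information. This is only a sketch: the function name is a placeholder for the `ChaosTransformer-...` name you see in the console, and it assumes your AWS credentials and region are already configured.

   ```python
   # Sketch: inspect the ETL Lambda's configuration and its trigger with boto3.
   # The function name below is illustrative -- use the one shown in your console.
   import boto3

   lambda_client = boto3.client('lambda')
   function_name = 'ChaosTransformer-1234ABCD'

   cfg = lambda_client.get_function_configuration(FunctionName=function_name)
   print('Timeout (s):', cfg['Timeout'])
   print('Memory (MB):', cfg['MemorySize'])
   print('Environment:', cfg.get('Environment', {}).get('Variables', {}))

   # The SQS queue that invokes the function shows up as an event source mapping.
   mappings = lambda_client.list_event_source_mappings(FunctionName=function_name)
   for mapping in mappings['EventSourceMappings']:
       print('Triggered by:', mapping['EventSourceArn'])
   ```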
49 | 50 | If you would like to review the code for your Lambda function you can see it in the `lambda.js` file in the `src` directory of this repository. 51 | 52 | 1. Review the DynamoDB table created using the [AWS DynamoDB console](https://console.aws.amazon.com/dynamodb/home?#tables:). 53 | 54 | The table will be named something like `chaos-data-1234ABCD`. It is unlikely to have any data in it yet as you have yet to put any files into the ETL pipeline. 55 | 56 | 1. Review the [SQS queues](https://console.aws.amazon.com/sqs/home?#) and [SNS topics](https://console.aws.amazon.com/sns/v3/home?#/topics) that were created. Note any triggers associated with the queues and topics. 57 | 58 | You will note that there are multiple SQS queues for holding new data file notifications (`chaos-json-work-queue-1234ABCD`), holding converted file notifications (`chaos-csv-work-queue-1234ABCD`), and a dead letter queue for any failed processing (`chaos-error-queue-1234ABCD`). 59 | 60 | There should be a single SNS topic named something like `chaos-csv-notification-topic-1234ABCD`. 61 | 62 | 1. Also visit the [S3 console](https://s3.console.aws.amazon.com/s3/home?) and find the bucket named something like `chaos-bucket-1234ABCD`. Notice the triggers that are configured to integrate S3 with SQS and SNS. 63 | 64 | ## Apply a load 65 | 66 | 1. To begin sending files through the pipeline execute the two driver programs provided for you in the `drivers` directory: 67 | 68 | ```bash 69 | $ cd ../drivers 70 | $ ./the_publisher.py & 71 | $ ./the_subscriber.py & 72 | ``` 73 | 74 | > **Note:** If you get a "botocore.exceptions.NoRegionError: You must specify a region." error message when executing the driver programs, you will need to configure your AWS CLI with `aws configure`, specifying the same region code as used above with Terraform. 75 | 76 | 1. Revisit some of the previous consoles for AWS Lambda, SQS, SNS, DynamoDB, and Amazon S3. 77 | 78 | You'll start to see the S3 bucket populated with files, items being stored into DynamoDB, and metrics generated by the Lambda function for every execution. Take a moment and review some of the information these consoles make available to you. 79 | 80 | ## Steady State 81 | 82 | Files are now being sent to Amazon S3, the entry point of your ETL pipeline. Upon landing in the S3 bucket the ETL Lambda function is being triggered to parse the received file, convert it to CSV, and write the CSV file back into the S3 bucket. When the CSV file lands in S3 the Amazon S3 service sends a notification to an SNS topic which has an SQS queue subscribed to the topic. 83 | 84 | When a file is encountered by the ETL Lambda function which it cannot parse it will experience an exception. S3 will invoke the Lambda function 2 more times in an effort to parse the file, if all 3 invocations experience an error the message will be stored into the dead letter queue configured for the Lambda function. 85 | 86 | To define and measure the steady state what metrics should be collected? And how should those metrics be used to produce indicators for the health of the ETL system? Is it helpful to measure the number of objects arriving into S3, or the number of Lambda errors that have occurred? What data do you need to produce [service level indicators](https://devopsinstitute.com/choosing-the-right-service-level-indicators/), and what service level indicators should be monitored? 
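   As a concrete example of the raw data behind one candidate indicator, the sketch below pulls the Lambda error count for the last fifteen minutes from CloudWatch, the same kind of query the dashboard (and the chaos experiments in later labs) are built on. The function name is again a placeholder, and AWS credentials and region are assumed to be configured.

   ```python
   # Sketch: fetch one candidate SLI input -- Lambda errors over the last 15 minutes.
   # The function name is illustrative; substitute your own.
   from datetime import datetime, timedelta, timezone
   import boto3

   cloudwatch = boto3.client('cloudwatch')
   now = datetime.now(timezone.utc)

   resp = cloudwatch.get_metric_statistics(
       Namespace='AWS/Lambda',
       MetricName='Errors',
       Dimensions=[{'Name': 'FunctionName', 'Value': 'ChaosTransformer-1234ABCD'}],
       StartTime=now - timedelta(minutes=15),
       EndTime=now,
       Period=300,
       Statistics=['Sum'],
   )
   errors = sum(point['Sum'] for point in resp['Datapoints'])
   print('Lambda errors in the last 15 minutes:', errors)
   ```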
87 | 88 | An [Amazon CloudWatch Dashboard](https://console.aws.amazon.com/cloudwatch/home?#dashboards:) has been created for this pipeline which tracks various metrics for the architecture and defines a few SLIs. Please use this to gain insight into the current state of the pipeline. 89 | 90 | ## Summary 91 | 92 | In this lab you used Infrastructure-as-Code to deploy a serverless ETL pipeline. You reviewed the various AWS consoles associated with the AWS services in use, and you also applied a load to the architecture using drivers to push files to and pull notifications from the ETL architecture. 93 | 94 | Leave the drivers running, in [the next lab](lab_2_inject_fault.md) you will begin to use chaos engineering tooling to learn how to inject failures into the serverless architecture. 95 | -------------------------------------------------------------------------------- /terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | ######################################### 2 | # 3 | # Outputs 4 | # 5 | ######################################### 6 | 7 | resource "local_file" "driver_variables" { 8 | filename = "${path.module}/../drivers/aws_resource_names.py" 9 | content = < "Chaos Engineering is the discipline of experimenting on a system in order to build confidence in the system's ability to withstand turbulent conditions in production." 29 | > 30 | > -- [Princple of Chaos Engineering](https://principlesofchaos.org/) 31 | 32 | Chaos engineering as a concept was developed by Greg Orzell in 2011 during his time working with Netflix ([Wikipedia](https://en.wikipedia.org/wiki/Chaos_engineering)). The practice is designed to test distributed, complex systems in order to assess how they cope under unusual operating conditions. 33 | 34 | Even when all of the individual services in a distributed system are functioning properly, the interactions between those services can cause unpredictable outcomes. Unpredictable outcomes, compounded by rare but disruptive real-world events that affect production environments, make these distributed systems inherently chaotic. 35 | 36 | Chaos engineering uses experiments to test the impact of potential failure modes on a distributed, complex application. To start you must first establish the steady state for an application. The steady state should be a quantifiable, measurable state of an application which is achieved when the application is delivering to your stated service level objectives. Once the steady state is determined develop experiments to introduce chaos or perturbations into the system, with the hypothesis that this disruption will not affect the steady state of the application, that the system will cope with the disruption. During the experiment's execution observe the application and, if the hypothesis is found to be false, use your observations to improve the system to better cope with the turbulent conditions created. 37 | 38 | There are numerous tools and techniques for conducting chaos experiments on different types of architecture. What follows are a set of labs to use two such tools to develop and execute chaos experiments on a serverless ETL application. 39 | 40 | Let's now download and deploy a serverless application that we can iterate on and improve through the process of chaos engineering. 41 | 42 | ## Prerequisites 43 | 44 | > Note: If you are running this from an AWS Cloud9 IDE you will not have all of the permissions you need to deploy this architecture. 
Disable the AWS managed temporary credentials and [configure an EC2 instance profile](https://docs.aws.amazon.com/cloud9/latest/user-guide/credentials.html#credentials-temporary) for your Cloud9 system. 45 | 46 | 1. Clone the repository locally. 47 | 48 | ```bash 49 | $ git clone https://github.com/jpbarto/serverless-chaos-lab.git 50 | $ cd serverless-chaos-lab 51 | ``` 52 | 53 | ### Repository Contents 54 | ```shell 55 | ├── README.md # Introduction (this doc) 56 | ├── docs # Lab guides 57 | │ ├── images 58 | │ ├── lab_1_serverless_etl.md 59 | │ ├── lab_2_inject_fault.md 60 | │ ├── lab_3_chaos_experiment.md 61 | │ └── lab_4_chaos_experiment_2.md 62 | ├── drivers 63 | │ ├── the_publisher.py # Publication driver for pipeline 64 | │ └── the_subscriber.py # Subscription driver for pipeline 65 | └── src 66 | │ ├── lambda.js # NodeJS code for ETL Lambda 67 | │ └── package.json # ETL dependencies 68 | └── terraform # Terraform templates for a Serverless ETL pipeline 69 | ├── application.tf # deploys the Lambda function 70 | ├── database.tf # deploys a DynamoDB table 71 | ├── filestore.tf # creates the S3 bucket 72 | ├── messaging.tf # creates a collection of queues and topics 73 | ├── monitoring.tf # creates a CloudWatch Dashboard 74 | ├── outputs.tf # creates identifier files for the drivers and chaos experiments 75 | └── template.tf # wrapper around the above 76 | ``` 77 | 78 | 1. If Terraform is not already installed, [install HashiCorp Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html) in your local environment. 79 | > Note: These labs require Terraform v12 or higher. 80 | 81 | 82 | If you are using an [AWS Cloud9 IDE](https://aws.amazon.com/cloud9/) instance the following should install Terraform for you: 83 | ```bash 84 | $ sudo yum install -y yum-utils 85 | $ sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo 86 | $ sudo yum -y install terraform 87 | ``` 88 | 89 | 1. To package the Lambda function, Terraform will need to use NPM and NodeJS. If NodeJS and NPM are not already installed please [install them](https://nodejs.org/en/download/) into your environment. 90 | 91 | On an AWS Cloud9 IDE use the following: 92 | ```bash 93 | $ sudo yum -y install npm 94 | ``` 95 | 96 | 1. In these labs you will use the [ChaosToolkit](https://chaostoolkit.org/) to script your chaos experiments. [Install the ChaosToolkit](https://docs.chaostoolkit.org/reference/usage/install/) using `pip3`: 97 | 98 | ```bash 99 | $ sudo pip3 install chaostoolkit chaostoolkit-aws 100 | ``` 101 | 102 | ## Labs 103 | 104 | With the above completed you're now ready to embark on a series of hands-on labs to learn about chaos engineering. Let's get started! 105 | 106 | 1. [Deploy a serverless ETL pipeline](docs/lab_1_serverless_etl.md) 107 | 108 | Using Hashicorp Terraform deploy a serverless pipeline for converting JSON documents to CSV. In this lab you will also provide a publisher of JSON documents to this pipeline and simulate a consumer of the pipeline in order to generate metrics data that is viewable using a CloudWatch metrics dashboard. 109 | 110 | (Key objective: observe the architecutre and observe how the pipeline performs during standard operation, review the Lambda function, define SLO) 111 | 112 | 1. [Inject fault into the pipeline](docs/lab_2_inject_fault.md) 113 | 114 | In this lab you will modify the Lambda function to incorporate the failure-lambda NodeJS package. 
With the package enabled you will test the function to observe how it responds to changes in the package's configuration. 115 | 116 | (Key objective: modify Lambda function and inject failure, observe functions behavior during injected failures) 117 | 118 | 1. [Author a chaos experiment](docs/lab_3_chaos_experiment.md) 119 | 120 | In this lab you will learn how to create a chaos experiment and execute it using the open source Chaos Toolkit. In executing the experiment you will uncover a flaw in the architecture's design. After correcting the flaw you should be able to re-run the experiment and see that steady state is maintained. 121 | 122 | (Key objective: Using Chaos Toolkit define an experiment and execute to inject latency, fix the architecture to maintain SLO in the face of latency) 123 | 124 | 1. [Simulate a serverless service disruption](docs/lab_4_chaos_experiment_2.md) 125 | 126 | In this lab you will introduce a different type of fault to simulate a permissions issue or a service outtage. 127 | 128 | (Key objective: Using Chaos Toolkit define an experiment and execute it to simulate a service disruption.) 129 | 130 | ## FAQ 131 | 1. **When running the Chaos Toolkit the probes fail even without disrupting the system, what is happening?** 132 | 133 | Some of the probes depend upon your system's local `date` command in order to obtain a datetime stamp to query CloudWatch metrics. The current command being used assumes a Linux-based environment. However if you are on a BSD-based environment, such as OSX, you will need to alter the `date` commands in your experiment JSON. Replace `date --date '5 min ago'` with `date -v -5M`. 134 | 135 | For example: 136 | 137 | ```json 138 | { 139 | "type": "probe", 140 | "name": "messages-in-flight", 141 | "tolerance": { 142 | "type": "range", 143 | "range": [0.0, 80.0], 144 | "target": "stdout" 145 | }, 146 | "provider": { 147 | "type": "process", 148 | "path": "aws", 149 | "arguments": "cloudwatch get-metric-data --metric-data-queries file://steadyStateFlight.json --start-time `date --date '5 min ago' -u '+%Y-%m-%dT%H:%M:%SZ'` --end-time `date -u '+%Y-%m-%dT%H:%M:%SZ'` --query 'MetricDataResults[0].Values[0]'" 150 | } 151 | }, 152 | ``` 153 | 154 | 1. **My `date` command is correct but the Chaos Toolkit probes still fail without disruption, what is happening?** 155 | 156 | It's unclear why but some configurations of the Python ecosystem and AWS CLI seem to have a detrimental effect on the `get-metrics-data` call to AWS CloudWatch metrics. When querying the API an empty data set is returned. This has been known to be the case on OSX Catalina with AWS CLI v1 and v2. 157 | 158 | 1. **After a failed experiment, I fix the issue and rerun the experiment but it still fails, what is happening?** 159 | 160 | After a failed experiment the Chaos Toolkit will rollback changes to allow the system to resume its steady state. However the system will not return to steady state instantaneously, it can take as much as 15 min for the system to return to its steady state and be ready for more testing. You *may* be able to accelerate this time to recovery by purging the SQS queues. 161 | 162 | 1. **The Chaos Toolkit isn't executing properly but I don't know why, how do I troubleshoot?** 163 | 164 | The Chaos Toolkit will write detailed output to `chaostoolkit.log` and a summary of the metrics it tracks to `journal.json`. You can look through these files for any clues as to what went wrong. 
You can also specify the `--verbose` flag upon execution to get more detailed output to the console: `chaos --verbose run experiment.json`. 165 | -------------------------------------------------------------------------------- /docs/lab_4_chaos_experiment_2.md: -------------------------------------------------------------------------------- 1 | # Lab 4: Experiment with Service Disruption 2 | 3 | ## Objective 4 | 5 | In this lab you will experiment with a different failure mode that could effect your application. The application relies on the DynamoDB service, lets inject intermittent connectivity to the service and observe how the overall application responds. 6 | 7 | ## Service Availability 8 | 9 | In the last lab you created your first chaos experiment using the Chaos Toolkit. In this lab you will explore a different failure mode and examine how the application performs. The failure mode we want to explore in this lab is the waivering availability of a dependency. We cannot temporarily disrupt the DynamoDB service however we can temporarily disrupt connectivity to the service. 10 | 11 | To simulate a service disruption we will again use the failure-lambda library's `blacklist` feature to block access to the DyanmoDB API some percentage of the time. 12 | 13 | ## The Next Experiment 14 | 15 | 1. Start your experiment definition 16 | 17 | Lets start as we did last time by creating a skeleton template for this experiment. Create a file named `exp_2-dynamodb_disruption.json` with the following contents: 18 | 19 | ```json 20 | { 21 | "version":"1.0.0", 22 | "title":"Dependency disruption should not impact processing", 23 | "description":"Disrupt access to the DynamoDB service and ensure files are still processed.", 24 | "tags":[ 25 | "serverless", 26 | "cloudnative", 27 | "etl" 28 | ], 29 | "configuration":{ 30 | "s3_bucket":{ 31 | "type":"env", 32 | "key":"S3_BUCKET_NAME" 33 | }, 34 | "sns_topic":{ 35 | "type":"env", 36 | "key":"SNS_TOPIC_NAME" 37 | }, 38 | "lambda_function":{ 39 | "type":"env", 40 | "key":"LAMBDA_FUNCTION_NAME" 41 | } 42 | }, 43 | "steady-state-hypothesis":{ 44 | "title":"System operating within norms", 45 | "probes":[ 46 | { 47 | "type":"probe", 48 | "name":"zero-sns-errors", 49 | "tolerance":0, 50 | "provider":{ 51 | "type":"python", 52 | "module":"chaosaws.cloudwatch.probes", 53 | "func":"get_metric_statistics", 54 | "arguments":{ 55 | "namespace":"AWS/SNS", 56 | "metric_name":"NumberOfNotificationsFailed", 57 | "dimension_name":"TopicName", 58 | "dimension_value":"${sns_topic}", 59 | "statistic":"Sum", 60 | "duration":900 61 | } 62 | } 63 | }, 64 | { 65 | "type":"probe", 66 | "name":"messages-in-flight", 67 | "tolerance":{ 68 | "type":"range", 69 | "range":[ 70 | -10.0, 71 | 50.0 72 | ], 73 | "target":"stdout" 74 | }, 75 | "provider":{ 76 | "type":"process", 77 | "path":"aws", 78 | "arguments":"cloudwatch get-metric-data --metric-data-queries file://steadyStateFlight.json --start-time `date --date '5 min ago' -u '+%Y-%m-%dT%H:%M:%SZ'` --end-time `date -u '+%Y-%m-%dT%H:%M:%SZ'` --query 'MetricDataResults[0].Values[0]'" 79 | } 80 | }, 81 | { 82 | "type":"probe", 83 | "name":"normal-error-rates", 84 | "tolerance":{ 85 | "type":"range", 86 | "range":[ 87 | 0.0, 88 | 5.0 89 | ], 90 | "target":"stdout" 91 | }, 92 | "provider":{ 93 | "type":"process", 94 | "path":"aws", 95 | "arguments":"cloudwatch get-metric-data --metric-data-queries file://steadyStateError.json --start-time `date --date '5 min ago' -u '+%Y-%m-%dT%H:%M:%SZ'` --end-time `date -u '+%Y-%m-%dT%H:%M:%SZ'` --query 
'MetricDataResults[0].Values[0]'" 96 | } 97 | } 98 | ] 99 | }, 100 | "method":[ 101 | ], 102 | "rollbacks":[ 103 | { 104 | "type":"action", 105 | "name":"Disable Lambda failures", 106 | "provider":{ 107 | "type":"process", 108 | "path":"aws", 109 | "arguments":"ssm put-parameter --name failureLambdaConfig --type String --overwrite --value '{\"isEnabled\": false}'" 110 | } 111 | } 112 | ] 113 | } 114 | ``` 115 | 116 | Everything in this template is the same as last time, you have the same steady state definition, the same rollback. The title and description are updated to reflect the nature of the experiment however. 117 | 118 | 1. Actions 119 | 120 | To simulate a service disruption with DynamoDB you will use the `blacklist` feature of the failure-lambda library. Similar to the last experiment, define an action which configures the failure-lambda library to block access to any URL which matches the DynamoDB endpoint URI 50% of the time. 121 | 122 | ```json 123 | "method":[ 124 | { 125 | "type":"action", 126 | "name":"Enable Lambda failure: BLACKLIST", 127 | "provider":{ 128 | "type":"process", 129 | "path":"aws", 130 | "arguments":"ssm put-parameter --name failureLambdaConfig --type String --overwrite --value '{\"isEnabled\": true, \"failureMode\": \"blacklist\", \"rate\": 0.5, \"blacklist\": [\"dynamodb.*.amazonaws.com\"]}'" 131 | }, 132 | "pauses":{ 133 | "after": 300 134 | } 135 | } 136 | ], 137 | ``` 138 | 139 | 1. Execute the experiment 140 | 141 | Now run the experiment. As last time it will modify the failure-lambda configuration and then wait 5 minutes before re-evaluating the steady state. 142 | 143 | In this instance the percentage of messages in flight should be outside of tolerance. 144 | 145 | ## What have we learned? 146 | 147 | 1. Understanding the results 148 | 149 | Your experiment is configured to check a number of service level indicators to determine if the application is working normally. The failed metric tracks the percentage of messages that are currently being processed by your ETL pipeline. To see this metric for yourself you can execute the following AWS CLI command from within the `chaos` directory: 150 | 151 | > Note that the `date` command below assumes the Linux operating system. 152 | 153 | ```bash 154 | $ aws cloudwatch get-metric-data --metric-data-queries file://steadyStateFlight.json --start-time `date --date '5 min ago' -u '+%Y-%m-%dT%H:%M:%SZ'` --end-time `date -u '+%Y-%m-%dT%H:%M:%SZ'` --query 'MetricDataResults[0].Values[0]' 155 | ``` 156 | 157 | As an alternative to the CLI you can visit your [CloudWatch metrics dashboard](https://console.aws.amazon.com/cloudwatch/home?#dashboards:) and look at the `Percent in Flight`. Whether through the CLI or the AWS Console you should see that the Percentage in Flight is well over 20%. You'll recall that this metric is calculated by subtracing the messages posted to the processed CSV SQS queue, and the records written to the DynamoDB table, from the messages posted to the JSON SQS queue and then dividing by the number of messages in the JSON queue. 158 | 159 | > ( ( (2*JSON messages) - (CSV messages + DynamoDB Writes) ) / (2*JSON messages) ) * 100 160 | 161 | Or alternatively 162 | 163 | > ( ( ( Messages In - Messages Out ) + ( Messages In - Records Stored ) ) / ( 2 * Messages In ) ) * 100 164 | 165 | The idea is that the number of messages in should always be, within tolerance, equal to the number of messages out, and the number of records written to the database. 
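   To make the arithmetic concrete, here is a small worked sketch (the message counts are invented purely for illustration) showing how the metric moves from roughly zero to a clearly elevated value when the CSV output keeps pace but only half of the DynamoDB writes succeed:

   ```python
   # Worked example of the "Percent in Flight" calculation (illustrative numbers only).
   def percent_in_flight(messages_in, messages_out, records_stored):
       return ((2 * messages_in) - (messages_out + records_stored)) / (2 * messages_in) * 100

   # Healthy pipeline: everything that arrives is converted and recorded.
   print(percent_in_flight(messages_in=600, messages_out=600, records_stored=600))  # 0.0

   # DynamoDB blocked ~50% of the time: CSV output keeps pace, but only half the records land.
   print(percent_in_flight(messages_in=600, messages_out=600, records_stored=300))  # 25.0
   ```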
166 | 167 | In this experiment the number of messages in flight has grown well beyond the expected tolerances, and even though half of the Lambda invocations are unable to communicate with the DynamoDB service the error rate has remained within norms. What is going on? 168 | 169 | 1. Behavior explained 170 | 171 | Diving into the metrics you'll notice that the number of messages going into the Output queue has remained roughly on par with the Input queue, but the number of DynamoDB updates is roughly half; this is in line with the 50% intermittent connectivity rate to DynamoDB. But if the Lambda is unable to communicate with DynamoDB, why isn't an error being raised? 172 | 173 | If you visit the Monitoring tab of the Lambda function and scroll down to the list of the most expensive invocations, these are likely to be executions that had difficulty connecting to DynamoDB. To review the log entries copy the RequestID and click the LogStream link for the request. On the CloudWatch Logs console, paste the RequestID in quotes into the Filter Events search field to view only those log entries that relate to the execution. Along with the normal execution messages you should see messages such as the following, which show the Lambda was unable to connect to DynamoDB: 174 | 175 | ``` 176 | 7b2c3ad9-0e0c-5c3c-83e7-6c2de114e600 INFO Intercepted network connection to dynamodb.us-east-2.amazonaws.com 177 | ``` 178 | 179 | If you look at the source code you'll notice that the Lambda should be writing a log entry when it has successfully written records to DynamoDB. Due to the asynchronous nature of NodeJS, the calls to DynamoDB (and S3) are performed without the handler waiting for them to complete. This provides a tremendous performance advantage and an event-driven code base, but it also means that the Lambda may exit while those asynchronous calls are still pending. 180 | 181 | ## Iterate and improve 182 | 183 | 1. Fix it 184 | 185 | Looking at the source code for the Lambda you'll notice that the function ends with a `return` statement. This returns control back to the Lambda service without waiting for the NodeJS event loop to be empty. If this line is changed to use the `callback` function, Lambda will wait for the event loop to be empty, i.e. for the DynamoDB calls to fail or succeed, before exiting. In this way the Lambda will continue trying to communicate with DynamoDB until the function times out, causing an error that counts against your service level objectives. 186 | 187 | Modify `lambda.js` to replace the return statement around line 133 with the following: 188 | 189 | ```javascript 190 | callback (null, response); 191 | ``` 192 | 193 | Reapply your Terraform template to push the source code change into AWS. 194 | 195 | 1. Break it again 196 | 197 | Now re-run your chaos experiment and notice that the experiment still fails, but now it fails because the error rate is unacceptably high. How could you improve the architecture to better account for this situation? 198 | 199 | 200 | ## Summary 201 | 202 | You have now concluded this workshop. You have used Chaos Toolkit and failure-lambda to develop and execute chaos experiments on a serverless architecture on AWS. There are many more experiments which could be performed on this architecture to improve it, but how will you now use this information to improve your own serverless architecture? 
203 | -------------------------------------------------------------------------------- /docs/lab_3_chaos_experiment.md: -------------------------------------------------------------------------------- 1 | # Lab 3: Your first Chaos Experiment 2 | 3 | ## Objective 4 | 5 | In this lab you will learn about [Chaos Toolkit](https://chaostoolkit.org/) and script it to test your serverless architecture. You will cover how to design a chaos experiment and then put it into practice to improve your ETL pipeline. 6 | 7 | ## Chaos Toolkit 8 | 9 | The [Chaos Toolkit](https://chaostoolkit.org/) aims to be the simplest and easiest way to explore building your own chaos engineering experiments. It also aims to define a vendor- and technology-independent way of specifying chaos engineering experiments by providing an open API. 10 | 11 | It uses a declarative and extensible format for specifying and scripting chaos experiments. This allows you to automate chaos engineering and incorporate experiments into your CI/CD pipelines. 12 | 13 | The toolkit has also been [extended](https://chaostoolkit.org/extensions) to support, out of the box, interaction with major cloud computing providers, Kubernetes, Spring, Spring Boot, and many others. 14 | 15 | To learn more about Chaos Toolkit please visit its [documentation](https://docs.chaostoolkit.org/). 16 | 17 | During this lab you will define your first chaos experiment to improve your serverless architecture. As part of this you will define a *steady state hypothesis*, which defines how the system will measure whether the ETL pipeline is behaving normally. You will also define a *method* to inject failures into the architecture. Chaos Toolkit will begin by evaluating the steady state of your architecture; provided that the architecture passes, the toolkit will then execute your method. After the chaos has been injected the toolkit will re-evaluate the steady state and report whether the system performed as expected or whether an error was detected. 18 | 19 | ## Failure modes 20 | 21 | Take a moment and consider the many ways that your ETL architecture could go wrong. What sort of adverse conditions can you imagine that may affect the pipeline? What effect would they have? How would you measure the impact of these effects, and how would you define whether your pipeline was still successful even if impaired? 22 | 23 | ## Your first experiment 24 | 25 | > Note: Ensure that the drivers are still running and applying a load to your application. The chaos experiments will rely on the drivers to demonstrate the ETL pipeline's ability to perform in turbulent conditions. 26 | 27 | 1. Prepare for your first experiment 28 | 29 | The Chaos Toolkit will need to interact with your architecture, so you will need to give it the names of components such as your Lambda function and SQS queues. These were exported by Terraform in the first lab and can be found in the `chaos/aws_resource_names.sh` file. If you inspect the file you will see that it defines a number of environment variables. Let's change into the `chaos` directory and source the environment variables to get started. 30 | 31 | ```bash 32 | $ cd chaos 33 | $ source aws_resource_names.sh 34 | ``` 35 | 36 | ## Define the experiment 37 | 38 | 1. Create your experiment's skeleton 39 | 40 | Chaos Toolkit experiments are defined as JSON files.
A detailed breakdown is available [online](https://docs.chaostoolkit.org/reference/api/experiment/) but for now create the beginnings of your experiment by creating a file named `exp_1-minor_delay.json` with the following contents: 41 | 42 | ```json 43 | { 44 | "version": "1.0.0", 45 | "title": "Minor delay should not impact processing", 46 | "description": "Inject latency into Lambda function execution and ensure files are still processed.", 47 | "tags": [ 48 | "serverless", 49 | "cloudnative", 50 | "etl" 51 | ], 52 | "configuration": { 53 | "s3_bucket": { 54 | "type": "env", 55 | "key": "S3_BUCKET_NAME" 56 | }, 57 | "sns_topic": { 58 | "type": "env", 59 | "key": "SNS_TOPIC_NAME" 60 | }, 61 | "lambda_function": { 62 | "type": "env", 63 | "key": "LAMBDA_FUNCTION_NAME" 64 | } 65 | }, 66 | "steady-state-hypothesis": { 67 | "title": "System operating within norms", 68 | "probes": [ 69 | ] 70 | }, 71 | "method": [ 72 | ], 73 | "rollbacks": [ 74 | ] 75 | } 76 | ``` 77 | 78 | A lot of the above is boiler plate and placeholders which will soon be completed in the steps to follow. One comment regarding the `configuration` section, it pulls in the environment variables defined in step 1 so they can be referenced in the rest of the experiment definition. For more about the `configuration` section please see the [documentation](https://docs.chaostoolkit.org/reference/api/experiment/#configuration). 79 | 80 | 1. Define a steady state 81 | 82 | Take a moment and consider what you think would be good, measurable indicators that demonstrate the ETL pipeline is executing as expected. Would you measure the number of errors produced? How many errors would you accept before notifying someone and considering the system to be in an error state? 83 | 84 | Would you measure how many messages are processed per second? If the system encountered a slowdown and files were processed in 5 minutes rather than 5 seconds, is this breaching any SLAs you maintain with your clients? 85 | 86 | What defines a steady state for any given application is very specific to the application itself. But for our purpose today we will measure three aspects of the architecture: 87 | 88 | 1. The number of errors experienced by SNS 89 | 1. The % of how many messages are currently being processed by the pipeline 90 | 1. The % of how many messages have experienced an error and are in the dead letter queue 91 | 92 | To mesaure the number of SNS errors we can simply create a probe which queries AWS CloudWatch. 93 | 94 | To measure the other two we will still use CloudWatch as our data source but we will need some simple mathematics to calculate the percentages that demonstrate steady state. There should be two files in your chaos folder, one for the in flight message calculation and one for the error rate calculation. Have a look at their source but we will pass them to the AWS CLI as another probe to be used by Chaos Toolkit to evaluate our steady state. 95 | 96 | All three probes are defined below. Update the `exp_1-minor_delay.json` file by adding this `probes` definition to the `steady-state-hypothesis` of your experiment. 
97 | 98 | ```json 99 | "probes": [ 100 | { 101 | "type": "probe", 102 | "name": "zero-sns-errors", 103 | "tolerance": 0, 104 | "provider": { 105 | "type": "python", 106 | "module": "chaosaws.cloudwatch.probes", 107 | "func": "get_metric_statistics", 108 | "arguments": { 109 | "namespace": "AWS/SNS", 110 | "metric_name": "NumberOfNotificationsFailed", 111 | "dimension_name": "TopicName", 112 | "dimension_value": "${sns_topic}", 113 | "statistic": "Sum", 114 | "duration": 900 115 | } 116 | } 117 | }, 118 | { 119 | "type": "probe", 120 | "name": "messages-in-flight", 121 | "tolerance": { 122 | "type": "range", 123 | "range": [-10.0, 80.0], 124 | "target": "stdout" 125 | }, 126 | "provider": { 127 | "type": "process", 128 | "path": "aws", 129 | "arguments": "cloudwatch get-metric-data --metric-data-queries file://steadyStateFlight.json --start-time `date --date '5 min ago' -u '+%Y-%m-%dT%H:%M:%SZ'` --end-time `date -u '+%Y-%m-%dT%H:%M:%SZ'` --query 'MetricDataResults[0].Values[0]'" 130 | } 131 | }, 132 | { 133 | "type": "probe", 134 | "name": "normal-error-rates", 135 | "tolerance": { 136 | "type": "range", 137 | "range": [0.0, 5.0], 138 | "target": "stdout" 139 | }, 140 | "provider": { 141 | "type": "process", 142 | "path": "aws", 143 | "arguments": "cloudwatch get-metric-data --metric-data-queries file://steadyStateError.json --start-time `date --date '5 min ago' -u '+%Y-%m-%dT%H:%M:%SZ'` --end-time `date -u '+%Y-%m-%dT%H:%M:%SZ'` --query 'MetricDataResults[0].Values[0]'" 144 | } 145 | } 146 | ] 147 | ``` 148 | 149 | 1. Evaluate the steady state 150 | 151 | You now have the beginnings of your experiment. Execute Chaos Toolkit with your definition and watch its output as it assesses the steady state of your application. 152 | 153 | ```bash 154 | $ chaos run exp_1-minor_delay.json 155 | ``` 156 | 157 | You will notice that the Chaos Toolkit evaluates all 3 of the probes and reports whether they fail or succeed. With the steady state satisfied it then moves on to the actions defined. Finding none defined it re-evaluates steady state, reports that all is well and then exits. 158 | 159 | Let's change that. 160 | 161 | ## Execute your experiment 162 | 163 | 1. Introduce some chaos 164 | 165 | The [method section](https://docs.chaostoolkit.org/reference/api/experiment/#method) of an experiment defines the step(s) to take in order to introduce turbulence into the system. The method section is a list of actions and probes which you define. 166 | 167 | Lets now introduce a minor latency of 3 to 5 seconds to the Lambda function. 168 | 169 | Update your experiment definition with the following action. It will modify the configuration parameter for the failure-lambda library causing the Lambda function to, 50% of the time, take 3 to 5 seconds longer to execute. After modifying the Lambda functions configuration the ChaosToolkit experiment will pause for 5 min before re-evaluating the steady state of the application. 170 | 171 | ```json 172 | "method": [ 173 | { 174 | "type": "action", 175 | "name": "Enable Lambda failure: LATENCY", 176 | "provider": { 177 | "type": "process", 178 | "path": "aws", 179 | "arguments": "ssm put-parameter --name failureLambdaConfig --type String --overwrite --value '{\"isEnabled\": true, \"failureMode\": \"latency\", \"rate\": 0.5, \"minLatency\": 3000, \"maxLatency\": 5000}'" 180 | }, 181 | "pauses": { 182 | "after": 300 183 | } 184 | } 185 | ], 186 | ``` 187 | 188 | 1. 
Experiment responsibly 189 | 190 | When the experiment has completed you will want to remove the turbulence you introduced into the system so that your application can resume normal operation. The [rollbacks section](https://docs.chaostoolkit.org/reference/api/experiment/#rollbacks) is designed to return the application to its initial state after the experiment has completed. 191 | 192 | Enter the following rollback section in order to disable the failure-lambda package: 193 | 194 | ```json 195 | "rollbacks": [ 196 | { 197 | "type": "action", 198 | "name": "Disable Lambda failures", 199 | "provider": { 200 | "type": "process", 201 | "path": "aws", 202 | "arguments": "ssm put-parameter --name failureLambdaConfig --type String --overwrite --value '{\"isEnabled\": false}'" 203 | } 204 | } 205 | ] 206 | ``` 207 | 208 | 1. Re-run your experiment 209 | 210 | So at this point you have configured an experiment which defines your steady state, the actions needed to introduce chaos, and the actions needed to rollback and revert to the initial state. Now re-run your experiment and observe the results. 211 | 212 | ```bash 213 | $ chaos run exp_1-minor_delay.json 214 | ``` 215 | 216 | After updating the failure-lambda configuration the Chaos Toolkit will wait 5 min to allow the system time to respond to the disruption. It should fail citing too high an error rate, rolling back the changes to resume normal operations. 217 | 218 | ## What has been learned? 219 | 220 | 1. Enhance the system 221 | 222 | The simple introduction of 3 to 5 seconds of latency caused a huge spike in the error rate of the application. Why would such a small latency cause such a huge issue? How can we make the system more resilient to this latency? 223 | 224 | Use [CloudWatch Logs Insights](https://console.aws.amazon.com/cloudwatch/home?#logs-insights:queryDetail=~(end~0~start~-3600~timeType~'RELATIVE~unit~'seconds~editorString~'filter*20*40type*20*3d*20*22REPORT*22*20*7c*0afields*20*40requestId*2c*20*40billedDuration*20*7c*0asort*20by*20*40billedDuration*20desc~isLiveTail~false~queryId~'363f4d7c-5725-4efd-9f64-99d93da45f1f~source~)) and the `Monitoring` tab of your function on the [AWS Lambda console](https://console.aws.amazon.com/lambda/home?#/functions) to dive into the logs and metrics data regarding your Lambda function. What do you notice about the Lambda function that may explain the error rate? How is the increased latency reflected in the data you have about the function's performance? 225 | 226 | You should notice that the average function duration is less than 500 milliseconds but then peaks at 3000 milliseconds. But if the latency was as much as 5 seconds, why is that not reflected? A closer look at the function's configuration will reveal a `Timeout` of 3 seconds, hence the function was being terminated after 3 seconds of execution, hence the increased error rate. 227 | 228 | To correct this, update the Terraform `application.tf` and set the `timeout` value for the `aws_lambda_function` resource to have a value of 120 seconds. With this file changed, reapply the Terraform code to push the change. 229 | 230 | ```bash 231 | $ cd ../terraform 232 | $ terraform apply 233 | ``` 234 | 235 | The `Timeout` of the Lambda function should now have a value of 2 minutes. 236 | 237 | 1. Re-re-run your experiment 238 | 239 | With the runtime of the Lambda function updated re-run your chaos experiment to ensure that the system is now able to cope with the minor latency. 
240 | 241 | ```bash 242 | $ cd ../chaos 243 | $ chaos run exp_1-minor_delay.json 244 | ``` 245 | 246 | ## Summary 247 | 248 | In this lab you created your first chaos experiment using the Chaos Toolkit. You injected latency into the Lambda function which processes incoming JSON file which uncovered an inability of the Lambda's configuration to cope with a minor latency. To fix the issue you extended the Lambda's runtime to allow for 2 minutes of execution. 249 | 250 | Let's now [define another experiment](lab_4_chaos_experiment_2.md) to continue testing the application. 251 | --------------------------------------------------------------------------------