├── dynamodb-traffic-blackhole-region-impairment
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── dynamodb-traffic-blackhole-region-impairment-template.json
    ├── dynamodb-traffic-blackhole-region-impairment-iam-policy.json
    └── README.md
├── templates
    ├── AWSFIS.json
    ├── example-iam-trust-relationship.json
    ├── example-iam-policy.json
    └── README.md
├── ec2-windows-stop-iis
    ├── images
    │   └── ssm.png
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ec2-windows-stop-iis-template.json
    ├── ec2-windows-stop-iis-iam-policy.json
    ├── README.md
    └── ec2-windows-stop-iis-ssm-template.json
├── sqs-queue-impairment
    ├── images
    │   └── sqs.png
    ├── AWSFIS.json
    ├── ssm-iam-trust-relationship.json
    ├── fis-iam-trust-relationship.json
    ├── sqs-queue-impairment-tag-based-ssm-automation-role-iam-policy.json
    ├── sqs-queue-impairment-tag-based-fis-role-iam-policy.json
    ├── sqs-queue-impairment-tag-based-experiment-template.json
    ├── README.md
    └── sqs-queue-impairment-tag-based-automation.yaml
├── cloudfront-impairment
    ├── AWSFIS.json
    ├── images
    │   ├── experiment-workflow.png
    │   └── cloudfront-impairment-architecture.png
    ├── ssm-iam-trust-relationship.json
    ├── fis-iam-trust-relationship.json
    ├── cloudfront-impairment-tag-based-ssm-automation-role-iam-policy.json
    ├── cloudfront-impairment-tag-based-fis-role-iam-policy.json
    └── cloudfront-impairment-tag-based-experiment-template.json
├── ec2-spot-interruption
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ec2-spot-interruption-template.json
    ├── ec2-spot-interruption-iam-policy.json
    └── README.md
├── aurora-cluster-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── aurora-cluster-failover-template.json
    ├── aurora-cluster-failover-iam-policy.json
    └── README.md
├── dynamodb-region-impairment
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── dynamodb-region-impairment-ssm-automation-role-iam-policy.json
    ├── fis-role-policy.json
    ├── dynamodb-region-impairment-fis-role-iam-policy.json
    ├── dynamodb-region-impairment-experiment-template.json
    ├── README.md
    └── dynamodb-region-impairment-automation.yaml
├── ec2-instances-terminate
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ec2-instances-terminate-iam-policy.json
    ├── ec2-instances-terminate-template.json
    └── README.md
├── sap-ec2-instance-stop-ascs
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── sap-ec2-instance-stop-sap-ascs-template.json
    ├── sap-ec2-instance-stop-sap-policy.json
    └── README.md
├── sap-ebs-pause-database-data
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── sap-ebs-pause-database-data-policy.json
    ├── sap-ebs-pause-database-data-template.json
    └── README.md
├── sap-ec2-instance-stop-database
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── sap-ec2-instance-stop-sap-database-template.json
    ├── sap-ec2-instance-stop-sap-database-policy.json
    └── README.md
├── aurora-global-region-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── aurora-global-region-failover-ssm-automation-role-iam-policy.json
    ├── aurora-global-region-failover-fis-role-iam-policy.json
    ├── aurora-global-region-failover-experiment-template.json
    └── README.md
├── mysql-rds-loadtest-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── mysql-rds-loadtest-failover-iam-policy.json
    ├── mysql-rds-loadtest-failover-template.json
    └── README.md
├── aurora-postgres-cluster-loadtest-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── aurora-postgres-cluster-loadtest-failover-iam-policy.json
    ├── aurora-postgres-cluster-loadtest-failover-template.json
    ├── README.md
    └── aurora-postgres-cluster-loadtest-failover-ssm-template.json
├── .gitignore
├── elasticache-redis-connection-failure
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── redis-connection-failure-ssm-role-iam-policy.json
    ├── redis-connection-failure-experiment-template.json
    ├── redis-connection-failure-fis-role-iam-policy.json
    ├── README.md
    └── redis-connection-failure-automation.yaml
├── elasticache-redis-primary-node-failover
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── elasticache-redis-primary-node-failover-ssm-role-iam-policy.json
    ├── elasticache-redis-primary-node-failover-experiment-template.json
    ├── elasticache-redis-primary-node-failover-fis-role-iam-policy.json
    ├── elasticache-redis-primary-node-failover-automation.json
    └── README.md
├── elasticache-redis-primary-node-reboot
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── elasticache-node-primary-node-reboot-ssm-role-iam-policy.json
    ├── elasticache-redis-primary-node-reboot-experiment-template.json
    ├── elasticache-redis-primary-node-reboot-fis-role-iam-policy.json
    ├── README.md
    └── elasticache-redis-primary-node-reboot-automation.yaml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
└── CONTRIBUTING.md
/dynamodb-traffic-blackhole-region-impairment/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "templateVersion": "2020-09-01"
3 | }
4 |
--------------------------------------------------------------------------------
/templates/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/ec2-windows-stop-iis/images/ssm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/ec2-windows-stop-iis/images/ssm.png
--------------------------------------------------------------------------------
/sqs-queue-impairment/images/sqs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/sqs-queue-impairment/images/sqs.png
--------------------------------------------------------------------------------
/cloudfront-impairment/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/ec2-spot-interruption/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/ec2-windows-stop-iis/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/aurora-cluster-failover/AWSFIS.json:
--------------------------------------------------------------------------------
1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version":
"1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /dynamodb-region-impairment/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /ec2-instances-terminate/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /sqs-queue-impairment/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /cloudfront-impairment/images/experiment-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/cloudfront-impairment/images/experiment-workflow.png -------------------------------------------------------------------------------- /cloudfront-impairment/images/cloudfront-impairment-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/cloudfront-impairment/images/cloudfront-impairment-architecture.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.js 2 | !jest.config.js 3 | *.d.ts 4 | node_modules 5 | .aws-sam 6 
| .vscode 7 | samconfig.toml 8 | .DS_Store 9 | 10 | # CDK asset staging directory 11 | .cdk.staging 12 | cdk.out 13 | .idea/ 14 | -------------------------------------------------------------------------------- /cloudfront-impairment/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sqs-queue-impairment/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 
10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /templates/example-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /aurora-cluster-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /ec2-instances-terminate/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /ec2-spot-interruption/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /ec2-windows-stop-iis/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /cloudfront-impairment/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /sqs-queue-impairment/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/fis-iam-trust-relationship.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aurora-global-region-failover/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aurora-global-region-failover/aurora-global-region-failover-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "rds:DescribeGlobalClusters", 8 | "rds:FailoverGlobalCluster" 9 | ], 10 | "Resource": "*" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-ssm-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "elasticache:DescribeReplicationGroups", 8 | "elasticache:ListTagsForResource", 9 | "elasticache:ModifyReplicationGroup" 10 | ], 11 | "Resource": "*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "sts:GetCallerIdentity" 17 | ], 18 | "Resource": "*" 19 | } 20 | ] 21 | } 22 | 
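Note: the trust-relationship and role-policy JSON files above come in pairs — a trust policy that lets ssm.amazonaws.com (or fis.amazonaws.com) assume a role, and a permissions policy scoped to the experiment. A minimal boto3 sketch of attaching such a pair to a role is shown below; the role and inline-policy names are illustrative assumptions, not values defined by these templates.

import boto3  # assumes AWS credentials and a default region are configured

iam = boto3.client("iam")

ROLE_NAME = "ElastiCache-Failover-SSM-Role"  # hypothetical name; align with your experiment template

# Create the role that SSM Automation will assume, using the trust relationship file above.
with open("ssm-iam-trust-relationship.json") as f:
    role = iam.create_role(RoleName=ROLE_NAME, AssumeRolePolicyDocument=f.read())

# Attach the experiment's permissions policy as an inline policy on that role.
with open("elasticache-redis-primary-node-failover-ssm-role-iam-policy.json") as f:
    iam.put_role_policy(
        RoleName=ROLE_NAME,
        PolicyName="elasticache-redis-primary-node-failover-ssm-role-iam-policy",
        PolicyDocument=f.read(),
    )

# Reference this ARN as AutomationAssumeRole in the experiment template's documentParameters.
print(role["Role"]["Arn"])
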
-------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-node-primary-node-reboot-ssm-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "elasticache:DescribeCacheClusters", 8 | "elasticache:DescribeReplicationGroups", 9 | "elasticache:ListTagsForResource", 10 | "elasticache:RebootCacheCluster" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "sts:GetCallerIdentity" 18 | ], 19 | "Resource": "*" 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "sqs:ListQueues", 8 | "sqs:ListQueueTags", 9 | "sqs:AddPermission", 10 | "sqs:RemovePermission", 11 | "sqs:GetQueueAttributes", 12 | "sqs:SetQueueAttributes" 13 | ], 14 | "Resource": "arn:aws:sqs:::*", 15 | "Condition": { 16 | "StringEquals": { 17 | "aws:ResourceTag/FIS-Ready": "True" 18 | } 19 | } 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /cloudfront-impairment/cloudfront-impairment-tag-based-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "cloudfront:ListDistributions", 8 | "cloudfront:ListTagsForResource", 9 | "cloudfront:GetDistributionConfig", 10 | "cloudfront:CreateInvalidation" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "s3:GetBucketPolicy", 18 | "s3:PutBucketPolicy", 19 | "s3:DeleteBucketPolicy" 20 | ], 21 | "Resource": "arn:aws:s3:::*" 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "dynamodb:PutResourcePolicy", 8 | "dynamodb:DeleteResourcePolicy", 9 | "dynamodb:GetResourcePolicy", 10 | "dynamodb:DescribeTable" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "sts:GetCallerIdentity" 18 | ], 19 | "Resource": "*" 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-ssm-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "elasticache:DescribeCacheClusters", 8 | "elasticache:ListTagsForResource" 9 | ], 10 | "Resource": "*" 11 | }, 12 | { 13 | "Effect": "Allow", 14 | "Action": [ 15 | "ec2:DescribeSecurityGroups", 16 | "ec2:AuthorizeSecurityGroupIngress", 17 | "ec2:RevokeSecurityGroupIngress" 18 | ], 19 | "Resource": "*" 20 | }, 21 | { 22 | "Effect": "Allow", 23 | "Action": [ 24 | "sts:GetCallerIdentity" 25 | ], 26 | "Resource": "*" 27 | } 28 
| ] 29 | } 30 | -------------------------------------------------------------------------------- /templates/example-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents", 10 | "logs:DescribeLogGroups", 11 | "logs:DescribeLogStreams" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Effect": "Allow", 17 | "Action": [ 18 | "fis:StartExperiment", 19 | "fis:GetExperimentSummary", 20 | "fis:GetExperimentResults", 21 | "fis:StopExperiment" 22 | ], 23 | "Resource": "*" 24 | } 25 | ] 26 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/aurora-global-region-failover-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ssm:StartAutomationExecution", 8 | "ssm:GetAutomationExecution", 9 | "ssm:DescribeAutomationExecutions", 10 | "ssm:DescribeAutomationStepExecutions", 11 | "ssm:StopAutomationExecution" 12 | ], 13 | "Resource": [ 14 | "arn:aws:ssm:*:*:document/*", 15 | "arn:aws:ssm:*:*:automation-execution/*" 16 | ] 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "iam:PassRole" 22 | ], 23 | "Resource": "arn:aws:iam::*:role/*SSM*" 24 | } 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/fis-role-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "dynamodb:PauseReplication", 8 | "dynamodb:ResumeReplication", 9 | "dynamodb:DescribeTable", 10 | "dynamodb:ListTagsOfResource" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "ssm:StartAutomationExecution", 18 | "ssm:GetAutomationExecution", 19 | "ssm:StopAutomationExecution" 20 | ], 21 | "Resource": "*" 22 | }, 23 | { 24 | "Effect": "Allow", 25 | "Action": [ 26 | "iam:PassRole" 27 | ], 28 | "Resource": "arn:aws:iam::*:role/*SSM*" 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
17 | -------------------------------------------------------------------------------- /ec2-spot-interruption/ec2-spot-interruption-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "interrupt-ec2-spot", 3 | "targets": { 4 | "spot-instances": { 5 | "resourceType": "aws:ec2:spot-instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | } 11 | }, 12 | "actions": { 13 | "interrupt-ec2-spot": { 14 | "actionId": "aws:ec2:send-spot-instance-interruptions", 15 | "parameters": { 16 | "durationBeforeInterruption": "PT2M" 17 | }, 18 | "targets": { 19 | "SpotInstances": "spot-instances" 20 | } 21 | } 22 | }, 23 | "stopConditions": [ 24 | { 25 | "source": "none" 26 | } 27 | ], 28 | "roleArn": "arn:aws:iam:::role/", 29 | "tags": { 30 | "Name": "interrupt-ec2-spot" 31 | }, 32 | "experimentOptions": { 33 | "accountTargeting": "single-account", 34 | "emptyTargetResolutionMode": "fail" 35 | } 36 | } -------------------------------------------------------------------------------- /ec2-instances-terminate/ec2-instances-terminate-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | "fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": "ec2:TerminateInstances", 17 | "Resource": "arn:aws:ec2:*:*:instance/*", 18 | "Condition": { 19 | "StringEquals": { 20 | "aws:ResourceTag/FIS-Ready": "True" 21 | } 22 | } 23 | }, 24 | { 25 | "Effect": "Allow", 26 | "Action": [ 27 | "logs:CreateLogGroup", 28 | "logs:CreateLogStream", 29 | "logs:PutLogEvents", 30 | "logs:DescribeLogGroups", 31 | "logs:DescribeLogStreams" 32 | ], 33 | "Resource": "*" 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/aurora-global-region-failover-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Aurora Global Database regional failover experiment using SSM automation to test disaster recovery procedures and measure RTO/RPO", 3 | "targets": {}, 4 | "actions": { 5 | "aurora-global-failover": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "parameters": { 8 | "documentArn": "arn:aws:ssm:::document/aurora-global-region-failover-automation", 9 | "documentParameters": "{\"globalClusterIdentifier\": \"\", \"failoverType\": \"switchover\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}", 10 | "maxDuration": "PT10M" 11 | } 12 | } 13 | }, 14 | "stopConditions": [ 15 | { 16 | "source": "none" 17 | } 18 | ], 19 | "roleArn": "arn:aws:iam:::role/", 20 | "tags": { 21 | "Name": "aurora-global-region-failover" 22 | }, 23 | "experimentOptions": { 24 | "accountTargeting": "single-account", 25 | "emptyTargetResolutionMode": "fail" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "dynamodb:PauseReplication", 8 | "dynamodb:ResumeReplication", 9 | "dynamodb:PutResourcePolicy", 10 | 
"dynamodb:DeleteResourcePolicy", 11 | "dynamodb:GetResourcePolicy", 12 | "dynamodb:DescribeTable", 13 | "dynamodb:ListTagsOfResource", 14 | "dynamodb:ListTables", 15 | "tag:GetResources" 16 | ], 17 | "Resource": "*" 18 | }, 19 | { 20 | "Effect": "Allow", 21 | "Action": [ 22 | "ssm:StartAutomationExecution", 23 | "ssm:GetAutomationExecution", 24 | "ssm:StopAutomationExecution" 25 | ], 26 | "Resource": "*" 27 | }, 28 | { 29 | "Effect": "Allow", 30 | "Action": [ 31 | "iam:PassRole" 32 | ], 33 | "Resource": "arn:aws:iam::*:role/*SSM*" 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /ec2-instances-terminate/ec2-instances-terminate-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "ec2-instance-terminate", 3 | "targets": { 4 | "Instances-Target-1": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "PERCENT(25)" 10 | } 11 | }, 12 | "actions": { 13 | "ec2-instances-terminate": { 14 | "actionId": "aws:ec2:terminate-instances", 15 | "parameters": {}, 16 | "targets": { 17 | "Instances": "Instances-Target-1" 18 | } 19 | } 20 | }, 21 | "stopConditions": [ 22 | { 23 | "source": "none" 24 | } 25 | ], 26 | "roleArn": "arn:aws:iam:::role/", 27 | "tags": {}, 28 | "experimentOptions": { 29 | "accountTargeting": "single-account", 30 | "emptyTargetResolutionMode": "fail" 31 | } 32 | } -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate Redis connection failure to test client circuit breaker behavior", 3 | "targets": {}, 4 | "actions": { 5 | "disableRedisConnections": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Disable Redis connections for 5 minutes to test resilience", 8 | "parameters": { 9 | "maxDuration": "PT30M", 10 | "documentArn": "arn:aws:ssm:::document/", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT5M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/", 22 | "tags": { 23 | "Name": "RedisConnectionFailureTest", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /cloudfront-impairment/cloudfront-impairment-tag-based-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:::document/", 25 | "arn:aws:ssm:::automation-definition/:*", 26 | 
"arn:aws:ssm:::automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowPassRole", 31 | "Effect": "Allow", 32 | "Action": ["iam:PassRole"], 33 | "Resource": ["arn:aws:iam:::role/"] 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:::document/", 25 | "arn:aws:ssm:::automation-definition/S:*", 26 | "arn:aws:ssm:::automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowPassRole", 31 | "Effect": "Allow", 32 | "Action": ["iam:PassRole"], 33 | "Resource": ["arn:aws:iam:::role/"] 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /cloudfront-impairment/cloudfront-impairment-tag-based-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate CloudFront distribution impairment with specific tag by disabling for 10 minutes", 3 | "targets": {}, 4 | "actions": { 5 | "impairCloudFront": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Simulate CloudFront distribution impairment by disabling for 10 minutes", 8 | "parameters": { 9 | "maxDuration": "PT1H", 10 | "documentArn": "arn:aws:ssm:::document/", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT10M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/", 22 | "tags": { 23 | "Name": "SimulateCloudFrontImpairment", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:::document/", 25 | "arn:aws:ssm:::automation-definition/:*", 26 | "arn:aws:ssm:::automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowPassRole", 31 | "Effect": "Allow", 32 | "Action": ["iam:PassRole"], 33 | "Resource": ["arn:aws:iam:::role/"] 34 | } 35 | ] 
36 | } 37 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-redis-primary-node-reboot-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate ElastiCache Redis primary node reboot to test application resilience", 3 | "targets": {}, 4 | "actions": { 5 | "triggerPrimaryNodeReboot": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Reboot Redis primary node and monitor recovery", 8 | "parameters": { 9 | "maxDuration": "PT30M", 10 | "documentArn": "arn:aws:ssm:us-east-1::document/ElastiCache-Redis-Primary-Node-Reboot", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"region\": \"us-east-1\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/ElastiCache-SSM-Automation-Role\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/ElastiCache-FIS-Role", 22 | "tags": { 23 | "Name": "ElastiCacheRedisPrimaryNodeRebootTest", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /aurora-cluster-failover/aurora-cluster-failover-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "aurora-cluster-failover", 3 | "targets": { 4 | "Clusters-Target-1": { 5 | "resourceType": "aws:rds:cluster", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL", 10 | "parameters": {} 11 | } 12 | }, 13 | "actions": { 14 | "failover-aurora-cluster": { 15 | "actionId": "aws:rds:failover-db-cluster", 16 | "parameters": {}, 17 | "targets": { 18 | "Clusters": "Clusters-Target-1" 19 | } 20 | } 21 | }, 22 | "stopConditions": [ 23 | { 24 | "source": "none" 25 | } 26 | ], 27 | "roleArn": "arn:aws:iam:::role/", 28 | "tags": {}, 29 | "experimentOptions": { 30 | "accountTargeting": "single-account", 31 | "emptyTargetResolutionMode": "fail" 32 | } 33 | } -------------------------------------------------------------------------------- /ec2-spot-interruption/ec2-spot-interruption-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ec2:SendSpotInstanceInterruptions", 8 | "ec2:DescribeSpotInstanceRequests", 9 | "ec2:DescribeInstances" 10 | ], 11 | "Resource": "*", 12 | "Condition": { 13 | "StringEquals": { 14 | "aws:ResourceTag/FIS-Ready": "True" 15 | } 16 | } 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "logs:CreateLogGroup", 22 | "logs:CreateLogStream", 23 | "logs:PutLogEvents", 24 | "logs:DescribeLogGroups", 25 | "logs:DescribeLogStreams" 26 | ], 27 | "Resource": "*" 28 | }, 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "fis:StartExperiment", 33 | "fis:GetExperimentSummary", 34 | "fis:GetExperimentResults", 35 | "fis:StopExperiment" 36 | ], 37 | "Resource": "*" 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /aurora-cluster-failover/aurora-cluster-failover-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | 
{ 5 | "Effect": "Allow", 6 | "Action": [ 7 | "rds:FailoverDBCluster", 8 | "rds:DescribeDBClusters" 9 | ], 10 | "Resource": [ 11 | "arn:aws:rds:*:*:cluster:*" 12 | ], 13 | "Condition": { 14 | "StringEquals": { 15 | "aws:ResourceTag/FIS-Ready": "True" 16 | } 17 | } 18 | }, 19 | { 20 | "Effect": "Allow", 21 | "Action": [ 22 | "logs:CreateLogGroup", 23 | "logs:CreateLogStream", 24 | "logs:PutLogEvents", 25 | "logs:DescribeLogGroups", 26 | "logs:DescribeLogStreams" 27 | ], 28 | "Resource": "*" 29 | }, 30 | { 31 | "Effect": "Allow", 32 | "Action": [ 33 | "fis:StartExperiment", 34 | "fis:GetExperimentSummary", 35 | "fis:GetExperimentResults", 36 | "fis:StopExperiment" 37 | ], 38 | "Resource": "*" 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /ec2-windows-stop-iis/ec2-windows-stop-iis-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "This Experiment Stops IIS on target Windows Instances", 3 | "targets": { 4 | "IISServers": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | } 11 | }, 12 | "actions": { 13 | "StopIIS": { 14 | "actionId": "aws:ssm:send-command", 15 | "parameters": { 16 | "documentArn": "arn:aws:ssm:::document/StopIISAppPool", 17 | "documentParameters": "{\"DurationSeconds\": \"285\", \"IISAppPoolName\": \"DefaultAppPool\"}", 18 | "duration": "PT5M" 19 | }, 20 | "targets": { 21 | "Instances": "IISServers" 22 | } 23 | } 24 | }, 25 | "stopConditions": [ 26 | { 27 | "source": "none" 28 | } 29 | ], 30 | "roleArn": "arn:aws:iam:::role/", 31 | "tags": { 32 | "Name": "StopIISExperiment" 33 | }, 34 | "experimentOptions": { 35 | "accountTargeting": "single-account", 36 | "emptyTargetResolutionMode": "fail" 37 | } 38 | } -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Force ElastiCache Redis primary node failover to test automatic failover mechanisms", 3 | "targets": {}, 4 | "actions": { 5 | "triggerPrimaryFailover": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Force Redis primary node failover and monitor completion", 8 | "parameters": { 9 | "maxDuration": "PT30M", 10 | "documentArn": "arn:aws:ssm:{{ aws:region }}::document/ElastiCache-Redis-Primary-Node-Failover", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"region\": \"{{ aws:region }}\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/ElastiCache-Failover-SSM-Role\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/ElastiCache-Failover-FIS-Role", 22 | "tags": { 23 | "Name": "ElastiCacheRedisPrimaryNodeFailoverTest", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ec2-windows-stop-iis/ec2-windows-stop-iis-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | 
"fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect":"Allow", 16 | "Action":[ 17 | "ssm:SendCommand" 18 | ], 19 | "Resource":[ 20 | "arn:aws:ssm:*:*:document/*" 21 | ] 22 | }, 23 | { 24 | "Effect": "Allow", 25 | "Action": [ 26 | "ssm:SendCommand" 27 | ], 28 | "Resource": [ 29 | "arn:aws:ec2:*:*:instance/*" 30 | ], 31 | "Condition": { 32 | "StringEquals": { 33 | "aws:ResourceTag/FIS-Ready": "True" 34 | } 35 | } 36 | }, 37 | { 38 | "Effect": "Allow", 39 | "Action": [ 40 | "logs:CreateLogGroup", 41 | "logs:CreateLogStream", 42 | "logs:PutLogEvents", 43 | "logs:DescribeLogGroups", 44 | "logs:DescribeLogStreams" 45 | ], 46 | "Resource": "*" 47 | } 48 | ] 49 | } -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/sap-ebs-pause-database-data-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ec2:DescribeVolumes" 8 | ], 9 | "Resource": "*" 10 | }, 11 | { 12 | "Effect": "Allow", 13 | "Action": [ 14 | "ec2:PauseVolumeIO" 15 | ], 16 | "Resource": "*" 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "logs:CreateLogGroup", 22 | "logs:CreateLogStream", 23 | "logs:PutLogEvents", 24 | "logs:DescribeLogGroups", 25 | "logs:DescribeLogStreams" 26 | ], 27 | "Resource": "*" 28 | }, 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "fis:StartExperiment", 33 | "fis:GetExperimentSummary", 34 | "fis:GetExperimentResults", 35 | "fis:StopExperiment" 36 | ], 37 | "Resource": "*" 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/dynamodb-traffic-blackhole-region-impairment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Block DynamoDB traffic at the network level and pause global table replication to simulate complete regional DynamoDB failure", 3 | "targets": { 4 | "subnets": { 5 | "resourceType": "aws:ec2:subnet", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | }, 11 | "dynamodbTables": { 12 | "resourceType": "aws:dynamodb:global-table", 13 | "resourceTags": { 14 | "FIS-Ready": "True" 15 | }, 16 | "selectionMode": "ALL" 17 | } 18 | }, 19 | "actions": { 20 | "blockDynamoDBTraffic": { 21 | "actionId": "aws:network:disrupt-connectivity", 22 | "description": "Block DynamoDB traffic from target subnets", 23 | "parameters": { 24 | "scope": "dynamodb", 25 | "duration": "PT10M" 26 | }, 27 | "targets": { 28 | "Subnets": "subnets" 29 | } 30 | }, 31 | "pauseReplication": { 32 | "actionId": "aws:dynamodb:global-table-pause-replication", 33 | "description": "Pause DynamoDB global table replication", 34 | "parameters": { 35 | "duration": "PT10M" 36 | }, 37 | "targets": { 38 | "Tables": "dynamodbTables" 39 | } 40 | } 41 | }, 42 | "stopConditions": [ 43 | { 44 | "source": "none" 45 | } 46 | ], 47 | "roleArn": "arn:aws:iam:::role/FIS-DynamoDB-Traffic-Blackhole-Role", 48 | "experimentOptions": { 49 | "emptyTargetResolutionMode": "skip" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/dynamodb-traffic-blackhole-region-impairment-iam-policy.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ec2:CreateNetworkAcl", 8 | "ec2:CreateNetworkAclEntry", 9 | "ec2:CreateTags", 10 | "ec2:DeleteNetworkAcl", 11 | "ec2:DescribeNetworkAcls", 12 | "ec2:DescribeSubnets", 13 | "ec2:DescribeVpcs", 14 | "ec2:ReplaceNetworkAclAssociation" 15 | ], 16 | "Resource": "*" 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "ec2:CreateNetworkAclEntry" 22 | ], 23 | "Resource": "*", 24 | "Condition": { 25 | "StringEquals": { 26 | "ec2:ResourceTag/managedByFIS": "true" 27 | } 28 | } 29 | }, 30 | { 31 | "Effect": "Allow", 32 | "Action": [ 33 | "ec2:DeleteNetworkAcl" 34 | ], 35 | "Resource": "*", 36 | "Condition": { 37 | "StringEquals": { 38 | "ec2:ResourceTag/managedByFIS": "true" 39 | } 40 | } 41 | }, 42 | { 43 | "Effect": "Allow", 44 | "Action": [ 45 | "dynamodb:PauseReplication", 46 | "dynamodb:ResumeReplication", 47 | "dynamodb:PutResourcePolicy", 48 | "dynamodb:DeleteResourcePolicy", 49 | "dynamodb:GetResourcePolicy", 50 | "dynamodb:DescribeTable", 51 | "dynamodb:ListTagsOfResource", 52 | "dynamodb:ListTables", 53 | "tag:GetResources" 54 | ], 55 | "Resource": "*" 56 | } 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Block DynamoDB access in a specific region to test global table failover and application resilience", 3 | "targets": { 4 | "dynamodbTables": { 5 | "resourceType": "aws:dynamodb:global-table", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | } 11 | }, 12 | "actions": { 13 | "pauseReplication": { 14 | "actionId": "aws:dynamodb:global-table-pause-replication", 15 | "description": "Pause DynamoDB global table replication in ", 16 | "parameters": { 17 | "duration": "PT12M" 18 | }, 19 | "targets": { 20 | "Tables": "dynamodbTables" 21 | } 22 | }, 23 | "blockDynamoDBAccess": { 24 | "actionId": "aws:ssm:start-automation-execution", 25 | "description": "Block DynamoDB access in target region for 10 minutes", 26 | "parameters": { 27 | "maxDuration": "PT15M", 28 | "documentArn": "arn:aws:ssm:::document/DynamoDB-Region-Impairment", 29 | "documentParameters": "{\"tableName\": \"\", \"targetRegion\": \"\", \"duration\": \"PT10M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/SSM-DynamoDB-Automation-Role\"}" 30 | }, 31 | "targets": {} 32 | } 33 | }, 34 | "stopConditions": [ 35 | { 36 | "source": "none" 37 | } 38 | ], 39 | "roleArn": "arn:aws:iam:::role/FIS-DynamoDB-Role", 40 | "experimentOptions": { 41 | "emptyTargetResolutionMode": "skip" 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-redis-primary-node-reboot-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | 
"Resource": [ 24 | "arn:aws:ssm:*:*:document/ElastiCache-Redis-Primary-Node-Reboot", 25 | "arn:aws:ssm:*:*:automation-definition/ElastiCache-Redis-Primary-Node-Reboot:*", 26 | "arn:aws:ssm:*:*:automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowElastiCacheOperations", 31 | "Effect": "Allow", 32 | "Action": [ 33 | "elasticache:DescribeReplicationGroups", 34 | "elasticache:ListTagsForResource", 35 | "elasticache:RebootCacheCluster" 36 | ], 37 | "Resource": "*" 38 | }, 39 | { 40 | "Sid": "AllowSTSOperations", 41 | "Effect": "Allow", 42 | "Action": [ 43 | "sts:GetCallerIdentity" 44 | ], 45 | "Resource": "*" 46 | }, 47 | { 48 | "Sid": "AllowPassRole", 49 | "Effect": "Allow", 50 | "Action": ["iam:PassRole"], 51 | "Resource": ["arn:aws:iam::*:role/*"] 52 | } 53 | ] 54 | } 55 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:*:*:document/ElastiCache-Redis-Primary-Node-Failover", 25 | "arn:aws:ssm:*:*:automation-definition/ElastiCache-Redis-Primary-Node-Failover:*", 26 | "arn:aws:ssm:*:*:automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowElastiCacheOperations", 31 | "Effect": "Allow", 32 | "Action": [ 33 | "elasticache:DescribeReplicationGroups", 34 | "elasticache:ListTagsForResource", 35 | "elasticache:ModifyReplicationGroup" 36 | ], 37 | "Resource": "*" 38 | }, 39 | { 40 | "Sid": "AllowSTSOperations", 41 | "Effect": "Allow", 42 | "Action": [ 43 | "sts:GetCallerIdentity" 44 | ], 45 | "Resource": "*" 46 | }, 47 | { 48 | "Sid": "AllowPassRole", 49 | "Effect": "Allow", 50 | "Action": ["iam:PassRole"], 51 | "Resource": ["arn:aws:iam::*:role/*"] 52 | } 53 | ] 54 | } 55 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/sap-ec2-instance-stop-sap-ascs-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "EC2 Stop - SAP ASCS ERS Cluster", 3 | "targets": { 4 | "SAPASCSERSCLUSTER": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Application": "SAP", 8 | "FIS-Ready": "True", 9 | "FIS-SAP-App-Tier": "Application", 10 | "FIS-SAP-Environment-Type": "Dev", 11 | "FIS-SAP-HA-Node": "Primary", 12 | "FIS-SAP-SID": "S4" 13 | }, 14 | "selectionMode": "ALL" 15 | } 16 | }, 17 | "actions": { 18 | "EC2STOP": { 19 | "actionId": "aws:ec2:stop-instances", 20 | "description": "Stop SAP ASCS Node", 21 | "parameters": { 22 | "startInstancesAfterDuration": "PT5M" 23 | }, 24 | "targets": { 25 | "Instances": "SAPASCSERSCLUSTER" 26 | } 27 | } 28 | }, 29 | "stopConditions": [ 30 | { 31 | "source": "none" 32 | } 33 | ], 34 | "roleArn": "arn:aws:iam:::role/", 35 | "tags": { 36 | "Name": "EC2 Stop - SAP ASCS ERS Cluster" 37 | }, 38 | "experimentOptions": { 39 | "accountTargeting": "single-account", 40 | 
"emptyTargetResolutionMode": "fail" 41 | } 42 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/sap-ec2-instance-stop-sap-database-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "EC2 Stop - SAP Database Cluster", 3 | "targets": { 4 | "SAPDBCLUSTERNODE": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Application": "SAP", 8 | "FIS-Ready": "True", 9 | "FIS-SAP-App-Tier": "Database", 10 | "FIS-SAP-Environment-Type": "Dev", 11 | "FIS-SAP-HA-Node": "Primary", 12 | "FIS-SAP-SID": "S4" 13 | }, 14 | "selectionMode": "ALL" 15 | } 16 | }, 17 | "actions": { 18 | "EC2STOP": { 19 | "actionId": "aws:ec2:stop-instances", 20 | "description": "EC2 Stop - SAP Database Cluster", 21 | "parameters": { 22 | "startInstancesAfterDuration": "PT5M" 23 | }, 24 | "targets": { 25 | "Instances": "SAPDBCLUSTERNODE" 26 | } 27 | } 28 | }, 29 | "stopConditions": [ 30 | { 31 | "source": "none" 32 | } 33 | ], 34 | "roleArn": "arn:aws:iam:::role/", 35 | "tags": { 36 | "Name": "EC2 Stop - SAP Database Cluster" 37 | }, 38 | "experimentOptions": { 39 | "accountTargeting": "single-account", 40 | "emptyTargetResolutionMode": "fail" 41 | } 42 | } -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/sap-ebs-pause-database-data-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "EBS Pause - SAP Database Cluster", 3 | "targets": { 4 | "EBSPAUSEDB": { 5 | "resourceType": "aws:ec2:ebs-volume", 6 | "resourceTags": { 7 | "FIS-Application": "SAP", 8 | "FIS-Ready": "True", 9 | "FIS-SAP-App-Tier": "Database", 10 | "FIS-SAP-Database-Type": "Data", 11 | "FIS-SAP-Environment-Type": "Dev", 12 | "FIS-SAP-SID": "S4" 13 | }, 14 | "selectionMode": "ALL", 15 | "parameters": { 16 | "availabilityZoneIdentifier": "us-east-1a" 17 | } 18 | } 19 | }, 20 | "actions": { 21 | "EBSPAUSEDB": { 22 | "actionId": "aws:ebs:pause-volume-io", 23 | "description": "EBS Pause - SAP DB", 24 | "parameters": { 25 | "duration": "PT5M" 26 | }, 27 | "targets": { 28 | "Volumes": "EBSPAUSEDB" 29 | } 30 | } 31 | }, 32 | "stopConditions": [ 33 | { 34 | "source": "none" 35 | } 36 | ], 37 | "roleArn": "arn:aws:iam:::role/", 38 | "tags": { 39 | "Name": "EBS Pause - SAP Database Cluster" 40 | }, 41 | "experimentOptions": { 42 | "accountTargeting": "single-account", 43 | "emptyTargetResolutionMode": "fail" 44 | } 45 | } -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/aurora-postgres-cluster-loadtest-failover-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:InjectApiInternalError", 8 | "fis:InjectApiThrottleError", 9 | "fis:InjectApiUnavailableError" 10 | ], 11 | "Resource": "*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "rds:FailoverDBCluster", 17 | "rds:DescribeDBClusters", 18 | "rds:DescribeDBInstances" 19 | ], 20 | "Resource": "*", 21 | "Condition": { 22 | "StringEquals": { 23 | "aws:ResourceTag/FIS-Ready": "True" 24 | } 25 | } 26 | }, 27 | { 28 | "Effect": "Allow", 29 | "Action": [ 30 | "ssm:SendCommand", 31 | "ssm:ListCommandInvocations", 32 | "ssm:DescribeInstanceInformation", 33 | "ssm:GetCommandInvocation", 34 | 
"ssm:DescribeDocumentParameters" 35 | ], 36 | "Resource": [ 37 | "arn:aws:ssm:*:*:document/aurora-cluster-loadtest-document", 38 | "arn:aws:ssm:*:*:document/AWS-RunShellScript" 39 | ] 40 | }, 41 | { 42 | "Effect": "Allow", 43 | "Action": [ 44 | "ssm:SendCommand" 45 | ], 46 | "Resource": "arn:aws:ec2:*:*:instance/*", 47 | "Condition": { 48 | "StringEquals": { 49 | "aws:ResourceTag/FIS-Ready": "True" 50 | } 51 | } 52 | }, 53 | { 54 | "Effect": "Allow", 55 | "Action": [ 56 | "ec2:DescribeInstances" 57 | ], 58 | "Resource": "*" 59 | }, 60 | { 61 | "Effect": "Allow", 62 | "Action": [ 63 | "logs:CreateLogDelivery", 64 | "logs:PutResourcePolicy", 65 | "logs:DescribeResourcePolicies", 66 | "logs:DescribeLogGroups" 67 | ], 68 | "Resource": "*" 69 | } 70 | ] 71 | } 72 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/sap-ec2-instance-stop-sap-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowEc2Actions", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "ec2:RebootInstances", 9 | "ec2:StartInstances", 10 | "ec2:StopInstances" 11 | ], 12 | "Resource": "arn:aws:ec2:*:*:instance/*" 13 | }, 14 | { 15 | "Sid": "AllowEc2InstancesWithEncryptedEbsVolumes", 16 | "Effect": "Allow", 17 | "Action": [ 18 | "kms:CreateGrant" 19 | ], 20 | "Resource": [ 21 | "arn:aws:kms:*:*:key/*" 22 | ], 23 | "Condition": { 24 | "StringLike": { 25 | "kms:ViaService": "ec2.*.amazonaws.com" 26 | }, 27 | "Bool": { 28 | "kms:GrantIsForAWSResource": "true" 29 | } 30 | } 31 | }, 32 | { 33 | "Sid": "AllowSSMSendOnEc2", 34 | "Effect": "Allow", 35 | "Action": [ 36 | "ssm:SendCommand" 37 | ], 38 | "Resource": [ 39 | "arn:aws:ec2:*:*:instance/*", 40 | "arn:aws:ssm:*:*:document/*" 41 | ] 42 | }, 43 | { 44 | "Sid": "AllowSSMStopOnEc2", 45 | "Effect": "Allow", 46 | "Action": [ 47 | "ssm:CancelCommand", 48 | "ssm:ListCommands" 49 | ], 50 | "Resource": "*" 51 | }, 52 | { 53 | "Sid": "DescribeInstances", 54 | "Effect": "Allow", 55 | "Action": "ec2:DescribeInstances", 56 | "Resource": "*" 57 | }, 58 | { 59 | "Effect": "Allow", 60 | "Action": [ 61 | "logs:CreateLogGroup", 62 | "logs:CreateLogStream", 63 | "logs:PutLogEvents", 64 | "logs:DescribeLogGroups", 65 | "logs:DescribeLogStreams" 66 | ], 67 | "Resource": "*" 68 | }, 69 | { 70 | "Effect": "Allow", 71 | "Action": [ 72 | "fis:StartExperiment", 73 | "fis:GetExperimentSummary", 74 | "fis:GetExperimentResults", 75 | "fis:StopExperiment" 76 | ], 77 | "Resource": "*" 78 | } 79 | ] 80 | } -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/aurora-postgres-cluster-loadtest-failover-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Aurora cluster CPU overload and failover experiment", 3 | "targets": { 4 | "AuroraCluster": { 5 | "resourceType": "aws:rds:cluster", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | }, 11 | "EC2Instance": { 12 | "resourceType": "aws:ec2:instance", 13 | "resourceTags": { 14 | "FIS-Ready": "True" 15 | }, 16 | "selectionMode": "ALL" 17 | } 18 | }, 19 | "actions": { 20 | "DelayAction": { 21 | "actionId": "aws:fis:wait", 22 | "description": "Wait 5 minutes to establish baseline metrics", 23 | "parameters": { 24 | "duration": "PT5M" 25 | } 26 | }, 27 | "RunLoadTest": { 28 | "actionId": "aws:ssm:send-command", 29 | "description": 
"Execute CPU load test on Aurora cluster", 30 | "parameters": { 31 | "documentArn": "arn:aws:ssm:::document/aurora-cluster-loadtest-document", 32 | "documentParameters": "{\"Duration\":\"600\",\"Concurrency\":\"10\"}", 33 | "duration": "PT10M" 34 | }, 35 | "targets": { 36 | "Instances": "EC2Instance" 37 | } 38 | }, 39 | "FailoverCluster": { 40 | "actionId": "aws:rds:failover-db-cluster", 41 | "description": "Initiate Aurora cluster failover", 42 | "parameters": {}, 43 | "targets": { 44 | "Clusters": "AuroraCluster" 45 | }, 46 | "startAfter": ["DelayAction"] 47 | } 48 | }, 49 | "stopConditions": [ 50 | { 51 | "source": "none" 52 | } 53 | ], 54 | "roleArn": "arn:aws:iam:::role/FISExperimentRole", 55 | "tags": { 56 | "Name": "Aurora-Cluster-CPU-Overload-Failover" 57 | }, 58 | "logConfiguration": { 59 | "logSchemaVersion": 2, 60 | "cloudWatchLogsConfiguration": { 61 | "logGroupArn": "arn:aws:logs:::log-group:FISExperimentLogs" 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /templates/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Spot Instances Interrupt 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | 10 | ## Description 11 | 12 | ## Hypothesis 13 | 14 | ## Prerequisites 15 | 16 | Before running this experiment, ensure that: 17 | 18 | 1. 19 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the termination operation. 20 | 3. The you want to target have the tag. 21 | 22 | ## How it works 23 | 24 | 25 | ## Stop Conditions 26 | 27 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 28 | 29 | ## Observability and stop conditions 30 | 31 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 32 | business metric requiring an immediate end of the fault injection. This 33 | template makes no assumptions about your application and the relevant metrics 34 | and does not include stop conditions by default. 35 | 36 | ## Next Steps 37 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 
-------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/sap-ec2-instance-stop-sap-database-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | "fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": "ec2:StopInstances", 17 | "Resource": "arn:aws:ec2:*:*:instance/*", 18 | "Condition": { 19 | "StringEquals": { 20 | "ec2:ResourceTag/FIS-Application": "SAP", 21 | "ec2:ResourceTag/FIS-Ready": "True", 22 | "ec2:ResourceTag/FIS-SAP-App-Tier": "Database", 23 | "ec2:ResourceTag/FIS-SAP-Environment-Type": "Dev", 24 | "ec2:ResourceTag/FIS-SAP-HA-Node": "Primary", 25 | "ec2:ResourceTag/FIS-SAP-SID": "S4" 26 | } 27 | } 28 | }, 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "logs:CreateLogGroup", 33 | "logs:CreateLogStream", 34 | "logs:PutLogEvents", 35 | "logs:DescribeLogGroups", 36 | "logs:DescribeLogStreams" 37 | ], 38 | "Resource": "*" 39 | }, 40 | { 41 | "Effect": "Allow", 42 | "Action": [ 43 | "logs:CreateLogGroup", 44 | "logs:CreateLogStream", 45 | "logs:PutLogEvents", 46 | "logs:DescribeLogGroups", 47 | "logs:DescribeLogStreams" 48 | ], 49 | "Resource": "*" 50 | }, 51 | { 52 | "Effect": "Allow", 53 | "Action": [ 54 | "fis:StartExperiment", 55 | "fis:GetExperimentSummary", 56 | "fis:GetExperimentResults", 57 | "fis:StopExperiment" 58 | ], 59 | "Resource": "*" 60 | } 61 | ] 62 | } 63 | -------------------------------------------------------------------------------- /ec2-instances-terminate/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Instance Termination 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Hypothesis 11 | 12 | Our application will remain available while 25% of the EC2 instances within our Auto Scaling group are terminated. 13 | 14 | ## Prerequisites 15 | 16 | Before running this experiment, ensure that: 17 | 18 | 1. You have the necessary permissions to execute the FIS experiment and perform the termination of EC2 instances. 19 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the termination operation. 20 | 3. The EC2 instances you want to target have the `FIS-Ready=True` tag. 21 | 22 | ## Stop Conditions 23 | 24 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been terminated.
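If you need to halt the experiment before it completes, it can be stopped from the AWS CLI. A minimal sketch, assuming the CLI is configured for the account and region running the experiment; the experiment ID shown is a placeholder:

```bash
# List experiments that are currently running.
aws fis list-experiments \
  --query 'experiments[?state.status==`running`].[id,experimentTemplateId,state.status]' \
  --output table

# Stop a specific running experiment by its ID (placeholder value).
aws fis stop-experiment --id EXPexample1234567890
```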
25 | 26 | ## Observability and stop conditions 27 | 28 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 29 | business metric requiring an immediate end of the fault injection. This 30 | template makes no assumptions about your application and the relevant metrics 31 | and does not include stop conditions by default. 32 | 33 | ## Next Steps 34 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 35 | 36 | ## Import Experiment 37 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 38 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-automation.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "0.3", 3 | "description": "Test ElastiCache Redis failover using tag-based discovery", 4 | "assumeRole": "arn:aws:iam::{{ global:ACCOUNT_ID }}:role/ElastiCache-SSM-Automation-Role", 5 | "parameters": { 6 | "tagKey": { 7 | "type": "String", 8 | "description": "Tag key to identify ElastiCache clusters to target", 9 | "default": "FIS-Ready" 10 | }, 11 | "tagValue": { 12 | "type": "String", 13 | "description": "Tag value to identify ElastiCache clusters to target", 14 | "default": "True" 15 | } 16 | }, 17 | "mainSteps": [ 18 | { 19 | "name": "triggerFailover", 20 | "action": "aws:executeScript", 21 | "inputs": { 22 | "Runtime": "python3.11", 23 | "Handler": "trigger_failover", 24 | "Script": "import boto3\ndef trigger_failover(events, context):\n region = events[\"region\"]\n tag_key = events[\"tagKey\"]\n tag_value = events[\"tagValue\"]\n elasticache = boto3.client(\"elasticache\", region_name=region)\n response = elasticache.describe_replication_groups()\n for rg in response[\"ReplicationGroups\"]:\n if rg[\"Status\"] == \"available\" and rg.get(\"AutomaticFailover\") == \"enabled\":\n rg_id = rg[\"ReplicationGroupId\"]\n try:\n account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n arn = \"arn:aws:elasticache:{}:{}:replicationgroup:{}\".format(region, account_id, rg_id)\n tags_response = elasticache.list_tags_for_resource(ResourceName=arn)\n tags = {tag[\"Key\"]: tag[\"Value\"] for tag in tags_response.get(\"TagList\", [])}\n if tags.get(tag_key) == tag_value:\n elasticache.test_failover(ReplicationGroupId=rg_id, NodeGroupId=\"0001\")\n return {\"ReplicationGroupId\": rg_id, \"Status\": \"Failover initiated\"}\n except Exception as e:\n continue\n raise Exception(\"No cluster found with tag {}={}\".format(tag_key, tag_value))", 25 | "InputPayload": { 26 | "region": "{{ global:REGION }}", 27 | "tagKey": "{{ tagKey }}", 28 | "tagValue": "{{ tagValue }}" 29 | } 30 | }, 31 | "outputs": [ 32 | { 33 | "Name": "Result", 34 | "Selector": "$.Payload", 35 | "Type": "StringMap" 36 | } 37 | ] 38 | } 39 | ] 40 | } 41 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/mysql-rds-loadtest-failover-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | 
"Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | "fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "rds:RebootDBInstance", 18 | "rds:DescribeDBInstances", 19 | "rds:DescribeDBClusters" 20 | ], 21 | "Resource": "*", 22 | "Condition": { 23 | "StringEquals": { 24 | "aws:ResourceTag/FIS-Ready": "True" 25 | } 26 | } 27 | }, 28 | { 29 | "Effect": "Allow", 30 | "Action": [ 31 | "ssm:SendCommand" 32 | ], 33 | "Resource": [ 34 | "arn:aws:ssm:*:*:document/*" 35 | ] 36 | }, 37 | { 38 | "Effect": "Allow", 39 | "Action": [ 40 | "ssm:SendCommand" 41 | ], 42 | "Resource": [ 43 | "arn:aws:ec2:*:*:instance/*" 44 | ], 45 | "Condition": { 46 | "StringEquals": { 47 | "aws:ResourceTag/FIS-Ready": "True" 48 | } 49 | } 50 | }, 51 | { 52 | "Effect": "Allow", 53 | "Action": [ 54 | "ssm:GetCommandInvocation", 55 | "ssm:ListCommands", 56 | "ssm:ListCommandInvocations", 57 | "ssm:DescribeInstanceInformation" 58 | ], 59 | "Resource": "*" 60 | }, 61 | { 62 | "Effect": "Allow", 63 | "Action": [ 64 | "ec2:DescribeInstances" 65 | ], 66 | "Resource": "*" 67 | }, 68 | { 69 | "Effect": "Allow", 70 | "Action": [ 71 | "logs:CreateLogGroup", 72 | "logs:CreateLogStream", 73 | "logs:PutLogEvents", 74 | "logs:DescribeLogGroups", 75 | "logs:DescribeLogStreams", 76 | "logs:CreateLogDelivery", 77 | "logs:DeleteLogDelivery", 78 | "logs:DescribeResourcePolicies", 79 | "logs:PutResourcePolicy" 80 | ], 81 | "Resource": "*" 82 | } 83 | ] 84 | } 85 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/mysql-rds-loadtest-failover-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "MySQL RDS Load Test and Failover Experiment", 3 | "targets": { 4 | "LoadTestInstances": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | }, 11 | "MySQLInstances": { 12 | "resourceType": "aws:rds:db", 13 | "resourceTags": { 14 | "FIS-Ready": "True" 15 | }, 16 | "selectionMode": "ALL" 17 | } 18 | }, 19 | "actions": { 20 | "RunLoadTest": { 21 | "actionId": "aws:ssm:send-command", 22 | "description": "Run MySQL high CPU load test until target CPU utilization is reached", 23 | "parameters": { 24 | "documentArn": "arn:aws:ssm:::document/MySQL-LoadTest-Document", 25 | "documentParameters": "{\"Duration\":\"600\",\"Concurrency\":\"25\",\"TargetCPU\":\"80\"}", 26 | "duration": "PT15M" 27 | }, 28 | "targets": { 29 | "Instances": "LoadTestInstances" 30 | } 31 | }, 32 | "ForceFailover": { 33 | "actionId": "aws:rds:reboot-db-instances", 34 | "description": "Force a failover by rebooting the primary instance with failover", 35 | "parameters": { 36 | "forceFailover": "true" 37 | }, 38 | "targets": { 39 | "DBInstances": "MySQLInstances" 40 | }, 41 | "startAfter": ["RunLoadTest"] 42 | }, 43 | "StopLoadTest": { 44 | "actionId": "aws:ssm:send-command", 45 | "description": "Stop the load test after failover completes", 46 | "parameters": { 47 | "documentArn": "arn:aws:ssm:::document/AWS-RunShellScript", 48 | "documentParameters": "{\"commands\":[\"pkill -f 'mysql_load_worker'\",\"echo \\\"Load test stopped\\\"\"]}", 49 | "duration": "PT1M" 50 | }, 51 | "targets": { 52 | "Instances": "LoadTestInstances" 53 | }, 54 | "startAfter": ["ForceFailover"], 55 | "startAfterDelay": "PT5M" 56 | } 57 | }, 58 | "stopConditions": [ 
59 | { 60 | "source": "none" 61 | } 62 | ], 63 | "roleArn": "arn:aws:iam:::role/", 64 | "logConfiguration": { 65 | "logSchemaVersion": 2, 66 | "cloudWatchLogsConfiguration": { 67 | "logGroupArn": "arn:aws:logs:::log-group:/aws/fis/experiment:*" 68 | } 69 | }, 70 | "tags": { 71 | "Name": "MySQL-RDS-LoadTest-Failover" 72 | }, 73 | "experimentOptions": { 74 | "accountTargeting": "single-account", 75 | "emptyTargetResolutionMode": "fail" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /aurora-global-region-failover/README.md: -------------------------------------------------------------------------------- 1 | # Aurora Global Database Regional Failover 2 | 3 | This experiment performs Aurora Global Database regional failover/switchover to test disaster recovery procedures and measure RTO/RPO. 4 | 5 | ## Prerequisites 6 | 7 | - Aurora Global Database with primary and secondary clusters 8 | - Global cluster tagged with `FIS-Ready: True` 9 | - IAM roles for FIS and SSM automation 10 | 11 | ## Failover Types 12 | 13 | - **Switchover** (default): Planned operation with no data loss for maintenance or testing 14 | - **Failover**: Emergency operation allowing data loss for disaster recovery 15 | 16 | ## Files 17 | 18 | - `aurora-global-region-failover-automation.yaml` - SSM automation document 19 | - `aurora-global-region-failover-experiment-template.json` - FIS experiment template 20 | - `aurora-global-region-failover-fis-role-iam-policy.json` - IAM policy for FIS role 21 | - `aurora-global-region-failover-ssm-automation-role-iam-policy.json` - IAM policy for SSM role 22 | - `fis-iam-trust-relationship.json` - Trust relationship for FIS role 23 | - `ssm-iam-trust-relationship.json` - Trust relationship for SSM role 24 | 25 | ## Setup 26 | 27 | 1. Create IAM roles: 28 | ```bash 29 | aws iam create-role --role-name --assume-role-policy-document file://fis-iam-trust-relationship.json 30 | aws iam put-role-policy --role-name --policy-name --policy-document file://aurora-global-region-failover-fis-role-iam-policy.json 31 | 32 | aws iam create-role --role-name --assume-role-policy-document file://ssm-iam-trust-relationship.json 33 | aws iam put-role-policy --role-name --policy-name --policy-document file://aurora-global-region-failover-ssm-automation-role-iam-policy.json 34 | ``` 35 | 36 | 2. Create SSM automation document: 37 | ```bash 38 | aws ssm create-document --name aurora-global-region-failover-automation --document-type Automation --content file://aurora-global-region-failover-automation.yaml --document-format YAML 39 | ``` 40 | 41 | 3. 
Update experiment template with your values and create: 42 | ```bash 43 | # Edit aurora-global-region-failover-experiment-template.json with your account/region/cluster details 44 | aws fis create-experiment-template --cli-input-json file://aurora-global-region-failover-experiment-template.json 45 | ``` 46 | 47 | ## Parameters 48 | 49 | - `globalClusterIdentifier`: Aurora Global Database cluster identifier (required) 50 | - `failoverType`: "switchover" for planned operations or "failover" for emergency with data loss (default: "switchover") 51 | - `AutomationAssumeRole`: IAM role ARN for automation execution (required) 52 | 53 | ## Usage 54 | 55 | Run the FIS experiment to perform a managed failover/switchover of the Aurora Global Database: 56 | 57 | ```bash 58 | aws fis start-experiment --experiment-template-id 59 | ``` 60 | 61 | The experiment will automatically detect the secondary cluster and promote it to primary based on the configured failover type. 62 | -------------------------------------------------------------------------------- /ec2-windows-stop-iis/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: Stopping IIS on Windows EC2 Instance 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Hypothesis 11 | 12 | Our application will remain available and resilient when IIS (Internet Information Services) is stopped on one of our Windows EC2 instances, simulating a scenario where the web server crashes or fails to start. 13 | 14 | ![Stop IIS Experiment](images/ssm.png) 15 | 16 | ## Prerequisites 17 | 18 | Before running this experiment, ensure that: 19 | 20 | 1. You have the necessary permissions to execute the FIS experiment and perform actions on Windows EC2 instances. 21 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the IIS stopping operation. 22 | 3. The Windows EC2 instances you want to target have the `FIS-Ready=True` tag. 23 | 4. SSM Agent is installed and running on the target Windows EC2 instances. 24 | 5. The IAM role associated with the EC2 instances has the necessary permissions for SSM. 25 | 26 | ## Stop Conditions 27 | 28 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until the IIS stopping action has been completed on the targeted resources. 29 | 30 | ## Observability and stop conditions 31 | 32 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 33 | business metric requiring an immediate end of the fault injection. This 34 | template makes no assumptions about your application and the relevant metrics 35 | and does not include stop conditions by default. 
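Because the fault is delivered through Systems Manager, you may want to confirm that the tagged Windows instances from the prerequisites above are registered with SSM before starting the experiment. A minimal sketch, assuming the `FIS-Ready=True` tag; adjust the tag key and value to your environment:

```bash
# Confirm SSM Agent is online on the tagged Windows targets.
aws ssm describe-instance-information \
  --filters "Key=tag:FIS-Ready,Values=True" \
  --query 'InstanceInformationList[].[InstanceId,PlatformType,PingStatus]' \
  --output table
```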
36 | 37 | ## Next Steps 38 | As you adapt this scenario to your needs, we recommend: 39 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 40 | 2. Identifying business metrics tied to the IIS service availability. 41 | 3. Creating an Amazon CloudWatch metric and Amazon CloudWatch alarm to monitor the impact of stopping IIS. 42 | 4. Adding a stop condition tied to the alarm to automatically halt the experiment if critical thresholds are breached. 43 | 5. Implementing proper logging and monitoring to track the behavior of your application when IIS is stopped. 44 | 45 | ## Import Experiment 46 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 47 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Stop - SAP ASCS ERS Cluster 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Description 11 | 12 | Explore the impact of interrupting the EC2 instance that hosts the SAP ABAP Central Services (ASCS). 13 | 14 | In this experiment we target EC2 Instances in the current region that have a specific tag attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on the EC2 instance hosting the ABAP SAP Central Services (ASCS), the ASCS process will fail over to the standby EC2 instance hosting the Enqueue Replication Server (ERS). The failover will occur within 5-15 minutes and users can resume operations. This validates the SAP application cluster configuration. 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the necessary permissions to execute the FIS experiment. 25 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the stop operation. 26 | 3. All your AWS resources are correctly tagged. 27 | ``` 28 | "FIS-Application": "SAP", 29 | "FIS-Ready": "True", 30 | "FIS-SAP-App-Tier": "Application", 31 | "FIS-SAP-Environment-Type": "Dev", 32 | "FIS-SAP-HA-Node": "Primary", 33 | "FIS-SAP-SID": "S4" 34 | ``` 35 | 36 | ## Stop Conditions 37 | 38 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 39 | 40 | ## Observability and stop conditions 41 | 42 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 43 | business metric requiring an immediate end of the fault injection.
This 44 | template makes no assumptions about your application and the relevant metrics 45 | and does not include stop conditions by default. 46 | 47 | ## Next Steps 48 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 49 | 50 | ## Import Experiment 51 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). -------------------------------------------------------------------------------- /aurora-cluster-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: Aurora Cluster Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and 4 | fis-template-library-tooling. This experiment template requires deployment into 5 | your AWS account and requires resources in your AWS account to inject faults into. 6 | 7 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 8 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 9 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 10 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | 12 | ## Description 13 | 14 | Explore the impact of failing over an Amazon Aurora cluster. 15 | 16 | In this experiment we target Amazon Aurora clusters in the current region that have a specific tag attached. 17 | 18 | ## Hypothesis 19 | 20 | Failover of an Aurora Cluster between the reader and writer instance may cause requests to fail for a brief period of time, but requests will automatically recover, and the application will continue to function as normal after the failover. 21 | 22 | ## Prerequisites 23 | 24 | Before running this experiment, ensure that: 25 | 26 | 1. You have the necessary permissions to execute the FIS experiment and perform the failover operation on the targeted Aurora clusters. 27 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the failover operation. 28 | 3. The Aurora clusters you want to target have the `FIS-Ready=True` tag. 29 | 4. The targeted Aurora clusters are configured for Multi-AZ deployment with writer and reader instances, and proper replication is set up. 30 | 31 | ## How it works 32 | 33 | This template simulates an Aurora DB cluster failover. It promotes one of the Aurora Replicas (read-only instances) in the DB cluster to be the primary DB instance (the cluster writer). To use this scenario you must have Amazon Aurora clusters that have the tag `FIS-Ready=True`. 34 | 35 | ## Observability and stop conditions 36 | 37 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 38 | business metric requiring an immediate end of the fault injection. This 39 | template makes no assumptions about your application and the relevant metrics 40 | and does not include stop conditions by default.
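To observe the failover, you can check which cluster member holds the writer role before and after running the experiment. A minimal sketch, assuming the AWS CLI and a placeholder cluster identifier:

```bash
# Show each cluster member and whether it is currently the writer
# (the cluster identifier below is a placeholder).
aws rds describe-db-clusters \
  --db-cluster-identifier my-aurora-cluster \
  --query 'DBClusters[0].DBClusterMembers[*].[DBInstanceIdentifier,IsClusterWriter]' \
  --output table
```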
41 | 42 | ## Next Steps 43 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 44 | 45 | ## Import Experiment 46 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EBS Pause - SAP Database Cluster 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Description 11 | 12 | Explore the impact of pausing I/O on the EBS volume that hosts the SAP database data. 13 | 14 | In this experiment we target EBS volumes in the current region that have specific tags attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on the block storage volume attached to the EC2 instance hosting the SAP database, the application will be unable to write data, causing a failover to the standby EC2 instance hosted in another AZ. The failover will occur within 15-30 minutes and users can resume operations. The application has an RTO requirement of 30 minutes and an RPO of near zero. This validates the SAP database cluster configuration. 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the necessary permissions to execute the FIS experiment. 25 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the volume I/O pause operation. 26 | 3. All your AWS resources are correctly tagged. 27 | ``` 28 | "FIS-Application": "SAP", 29 | "FIS-Ready": "True", 30 | "FIS-SAP-App-Tier": "Database", 31 | "FIS-SAP-Database-Type": "Data", 32 | "FIS-SAP-Environment-Type": "Dev", 33 | "FIS-SAP-SID": "S4" 34 | ``` 35 | 36 | ## Stop Conditions 37 | 38 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 39 | 40 | ## Observability and stop conditions 41 | 42 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 43 | business metric requiring an immediate end of the fault injection. This 44 | template makes no assumptions about your application and the relevant metrics 45 | and does not include stop conditions by default.
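Before starting the experiment, it can help to confirm which volumes would actually be paused. A minimal sketch, assuming the tags shown above and the `us-east-1a` zone used in the sample template; adjust both to your environment:

```bash
# List the SAP data volumes the experiment would target.
aws ec2 describe-volumes \
  --filters "Name=tag:FIS-Ready,Values=True" \
            "Name=tag:FIS-SAP-Database-Type,Values=Data" \
            "Name=availability-zone,Values=us-east-1a" \
  --query 'Volumes[].[VolumeId,State,Attachments[0].InstanceId]' \
  --output table
```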
46 | 47 | ## Next Steps 48 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 49 | 50 | ## Import Experiment 51 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 52 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Stop - SAP Database Cluster 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Description 11 | 12 | Explore the impact of stopping the EC2 instance that hosts the SAP database. 13 | 14 | In this experiment we target EC2 instances in the current region that have specific tags attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on the EC2 instance hosting the SAP database, the application will fail over to the standby EC2 instance hosted in another AZ. The failover will occur within 15-30 minutes and users can resume operations. The application has an RTO requirement of 30 minutes and an RPO of near zero. This validates the SAP database cluster configuration. 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the necessary permissions to execute the FIS experiment. 25 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the stop operation. 26 | 3. All your AWS resources are correctly tagged. 27 | ``` 28 | "ec2:ResourceTag/FIS-Application": "SAP", 29 | "ec2:ResourceTag/FIS-Ready": "True", 30 | "ec2:ResourceTag/FIS-SAP-App-Tier": "Database", 31 | "ec2:ResourceTag/FIS-SAP-Environment-Type": "Dev", 32 | "ec2:ResourceTag/FIS-SAP-HA-Node": "Primary", 33 | "ec2:ResourceTag/FIS-SAP-SID": "S4" 34 | ``` 35 | 36 | ## Stop Conditions 37 | 38 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 39 | 40 | ## Observability and stop conditions 41 | 42 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 43 | business metric requiring an immediate end of the fault injection. This 44 | template makes no assumptions about your application and the relevant metrics 45 | and does not include stop conditions by default.
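Before running the experiment, you can confirm which primary database node would be stopped. A minimal sketch using the tag filters listed above; adjust them to your landscape:

```bash
# Show the tagged primary SAP database instance(s) the experiment would stop.
aws ec2 describe-instances \
  --filters "Name=tag:FIS-Ready,Values=True" \
            "Name=tag:FIS-SAP-App-Tier,Values=Database" \
            "Name=tag:FIS-SAP-HA-Node,Values=Primary" \
  --query 'Reservations[].Instances[].[InstanceId,Placement.AvailabilityZone,State.Name]' \
  --output table
```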
46 | 47 | ## Next Steps 48 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 49 | 50 | ## Import Experiment 51 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiments 2 | 3 | This repository contains a collection of AWS Fault Injection Service (FIS) experiments designed to test the resilience and fault tolerance of your AWS resources and applications. These experiments simulate various failure scenarios to help you identify potential vulnerabilities and validate your system's ability to recover from disruptions. 4 | 5 | ## Available Experiments 6 | 7 | Browse the experiment directories to find templates for various fault injection scenarios: 8 | 9 | - **EC2 Instance Management**: `ec2-instances-terminate/`, `ec2-spot-interruption/`, `ec2-windows-stop-iis/` 10 | - **Database Resilience**: `aurora-cluster-failover/`, `sap-ebs-pause-database-data/` 11 | - **SAP Systems**: `sap-ec2-instance-stop-ascs/`, `sap-ec2-instance-stop-database/` 12 | - **Simple Queue Service (SQS)**: `sqs-queue-impairment/` 13 | 14 | Each experiment directory contains: 15 | - Complete FIS experiment template (JSON) 16 | - Required IAM policies and trust relationships 17 | - Comprehensive README with setup instructions 18 | - Additional automation files where applicable 19 | 20 | ## Getting Started 21 | 22 | To use these experiments, follow these steps: 23 | 24 | 1. **Prerequisites**: Ensure you have the necessary permissions and IAM roles configured to run FIS experiments in your AWS account. 25 | 26 | 2. **Choose an Experiment**: Browse the available experiment directories and select one that matches your testing scenario. 27 | 28 | 3. **Review Documentation**: Read the experiment's README.md file thoroughly to understand prerequisites, expected behavior, and safety considerations. 29 | 30 | 4. **Configuration**: Customize the template files by replacing placeholder values (e.g., ``, ``) with your specific AWS account information. 31 | 32 | 5. **Deploy**: Import the experiment template into your AWS account using the [FIS Template Library Tooling](https://github.com/aws-samples/fis-template-library-tooling). 33 | 34 | 6. **Execute Safely**: Run the experiment in a non-production environment first, with proper monitoring and stop conditions in place. 35 | 36 | 7. **Monitor and Analyze**: Observe the impact on your resources and analyze the results to improve your system's resilience. 37 | 38 | ## Contributing 39 | 40 | We welcome contributions of new FIS experiment templates! 
41 | 42 | **📋 Before contributing, please read our [Style Guide](STYLE_GUIDE.md) which details all requirements and standards.** 43 | 44 | Key requirements for contributions: 45 | - Follow the standardized directory structure and file naming conventions 46 | - Include comprehensive documentation with safety disclaimers 47 | - Provide complete IAM policies following least privilege principles 48 | - Include observability and monitoring recommendations 49 | - Reference the `ec2-windows-stop-iis/` directory as the gold standard example 50 | 51 | See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed contribution guidelines. 52 | 53 | ## Disclaimer 54 | 55 | These experiments are designed to simulate failure scenarios in your AWS environment. While precautions have been taken to minimize potential risks, running these experiments may cause temporary disruptions or outages to your resources and applications. It is highly recommended to thoroughly review and test the experiments in a non-production environment before running them in a production setting. 56 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate worsening impairment of SQS queues with specific tag by applying deny-all policy for increasing durations", 3 | "targets": {}, 4 | "actions": { 5 | "impairSqs2m": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 2 minutes", 8 | "parameters": { 9 | "maxDuration": "PT1H", 10 | "documentArn": "arn:aws:ssm:::document/", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT2M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 12 | }, 13 | "targets": {} 14 | }, 15 | "wait3m1": { 16 | "actionId": "aws:fis:wait", 17 | "description": "Wait for 3 minutes", 18 | "parameters": { 19 | "duration": "PT3M" 20 | }, 21 | "targets": {}, 22 | "startAfter": ["impairSqs2m"] 23 | }, 24 | "impairSqs5m": { 25 | "actionId": "aws:ssm:start-automation-execution", 26 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 5 minutes", 27 | "parameters": { 28 | "maxDuration": "PT1H", 29 | "documentArn": "arn:aws:ssm:::document/", 30 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT5M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 31 | }, 32 | "targets": {}, 33 | "startAfter": ["wait3m1"] 34 | }, 35 | "wait3m2": { 36 | "actionId": "aws:fis:wait", 37 | "description": "Wait for 3 minutes", 38 | "parameters": { 39 | "duration": "PT3M" 40 | }, 41 | "targets": {}, 42 | "startAfter": ["impairSqs5m"] 43 | }, 44 | "impairSqs7m": { 45 | "actionId": "aws:ssm:start-automation-execution", 46 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 7 minutes", 47 | "parameters": { 48 | "maxDuration": "PT1H", 49 | "documentArn": "arn:aws:ssm:::document/", 50 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT7M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 51 | }, 52 | "targets": {}, 53 | "startAfter": ["wait3m2"] 54 | }, 55 | "wait2m1": { 56 | "actionId": "aws:fis:wait", 57 | "description": "Wait for 2 minutes", 58 | "parameters": { 59 | "duration": "PT2M" 60 | }, 61 | "targets": {}, 62 | 
"startAfter": ["impairSqs7m"] 63 | }, 64 | "impairSqs15m": { 65 | "actionId": "aws:ssm:start-automation-execution", 66 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 15 minutes", 67 | "parameters": { 68 | "maxDuration": "PT1H", 69 | "documentArn": "arn:aws:ssm:::document/", 70 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT15M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 71 | }, 72 | "targets": {}, 73 | "startAfter": ["wait2m1"] 74 | } 75 | }, 76 | "stopConditions": [ 77 | { 78 | "source": "none" 79 | } 80 | ], 81 | "roleArn": "arn:aws:iam:::role/", 82 | "tags": { 83 | "Name": "SimulateSqsImpairment", 84 | "Purpose": "resilience-testing" 85 | }, 86 | "experimentOptions": { 87 | "accountTargeting": "single-account", 88 | "emptyTargetResolutionMode": "skip" 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /ec2-spot-interruption/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Spot Instances Interrupt 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | 10 | ## Description 11 | 12 | Explore the impact of the termination of EC2 Spot Instances. 13 | 14 | In this experiment we target EC2 Spot Instances in the current region that have a specific tag attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on EC2 Spot Instances, instances will gracefully terminate, and applications or services running on those instances will be automatically restarted on new Spot Instances or fallback to On-Demand Instances, ensuring minimal disruption to the overall system. 19 | 20 | Specifically, we expect the following behavior: 21 | 22 | 1. **Graceful Termination**: Upon receiving the interruption signal, EC2 Spot Instances will initiate a graceful termination process, allowing applications or services to perform any necessary cleanup tasks or save their state before terminating. 23 | 24 | 2. **Automatic Restarting**: Applications or services running on the interrupted Spot Instances are configured for automatic restart and will be automatically launched on new Spot Instances or fallback to On-Demand Instances, depending on the defined scaling policies and capacity provisioning strategies. 25 | 26 | 3. **Load Balancing and Failover**: If the applications or services are running behind a load balancer, traffic will be automatically rerouted to the newly launched instances, ensuring seamless failover and minimizing downtime. 27 | 28 | 4. 
**Data Persistence**: Any persistent data or state associated with the applications or services running on the interrupted Spot Instances will be successfully recovered or replicated to the new instances, ensuring data consistency and integrity. 29 | 30 | 5. **Monitoring and Alerting**: The interruption event and subsequent recovery actions will be captured by the monitoring and alerting systems, providing visibility into the system's behavior and enabling timely incident response and analysis. 31 | 32 | By validating this hypothesis, we can demonstrate the resilience of our applications and services running on EC2 Spot Instances and ensure that they can gracefully handle interruptions while minimizing the impact on end-users or customers. 33 | 34 | ## Prerequisites 35 | 36 | Before running this experiment, ensure that: 37 | 38 | 1. You have the necessary permissions to execute the FIS experiment and perform the termination of EC2 Spot Instance 39 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the termination operation. 40 | 3. The EC2 Spot Instance you want to target have the `FIS-Ready=True` tag. 41 | 42 | ## How it works 43 | 44 | The experiment sends an interruption signal to 25% of targeted EC2 Spot Instances using the AWS API `aws:ec2:send-spot-instance-interruptions`. This action simulates a real-world scenario where the Spot Instances are interrupted due to changes in the Spot market or capacity constraints. 45 | 46 | `durationBeforeInterruption`: A duration of 4 minutes (PT4M) is set before the interruption is triggered. This allows for any necessary preparations or cleanup tasks to be executed before the interruption occurs. 47 | 48 | ## Stop Conditions 49 | 50 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 51 | 52 | ## Observability and stop conditions 53 | 54 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 55 | business metric requiring an immediate end of the fault injection. This 56 | template makes no assumptions about your application and the relevant metrics 57 | and does not include stop conditions by default. 58 | 59 | ## Next Steps 60 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 61 | 62 | ## Import Experiment 63 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 64 | -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: DynamoDB Traffic Blackhole Region Impairment 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When network connectivity to DynamoDB is completely blocked from my application subnets, monitoring systems should detect the connectivity failure within 2-3 minutes and trigger alerts. The DevOps team should be notified within 5 minutes through our alerting channels. If automated failover is configured, it should activate within 10 minutes. For manual intervention, the team should acknowledge the incident within 15 minutes and complete failover procedures within 30-45 minutes. All DynamoDB operations should fail with network timeout errors during the 10-minute impairment period. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Network-level DynamoDB connectivity monitoring and alerting works correctly 14 | * Application timeout and retry logic handles network failures appropriately 15 | * Circuit breaker patterns function as expected for DynamoDB connectivity issues 16 | * Graceful degradation or failover mechanisms activate when DynamoDB is unreachable 17 | * Error handling and user experience during complete DynamoDB network blackouts 18 | * Recovery behavior when network connectivity is restored 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have created the IAM role for FIS with the provided policy document 25 | 2. You have created the FIS Experiment Template from the sample provided 26 | 3. **Update the AWS account ID** in the template files to match your account 27 | 4. The EC2 subnets containing your application instances have the "FIS-Ready":"True" tag 28 | 5. Your application instances are running in the tagged subnets and actively using DynamoDB 29 | 6. You have appropriate monitoring and observability in place to track the impact 30 | 31 | ## How it works 32 | 33 | This experiment uses the `aws:network:disrupt-connectivity` action with `scope: dynamodb` to block all network traffic between your application subnets and the DynamoDB regional endpoints. 34 | 35 | ### Network ACL Mechanism 36 | 37 | FIS temporarily: 38 | 1. Clones the existing network ACL associated with target subnets 39 | 2. Adds deny rules to block DynamoDB traffic in the cloned ACL 40 | 3. Associates the modified ACL with your subnets for the experiment duration 41 | 4. Automatically restores the original ACL when the experiment completes 42 | 43 | ### Duration and Scope 44 | 45 | - **Duration**: 10 minutes (configurable via `duration` parameter) 46 | - **Scope**: DynamoDB regional endpoints only - other AWS services remain accessible 47 | - **Traffic Blocked**: All inbound and outbound DynamoDB API calls from target subnets 48 | - **Intra-subnet**: Traffic between instances in the same subnet remains unaffected 49 | 50 | ## Target Resources 51 | 52 | This experiment targets EC2 subnets tagged with `FIS-Ready: True`. All instances in these subnets will lose DynamoDB connectivity during the experiment. 53 | 54 | ## Stop Conditions 55 | 56 | The experiment includes basic stop conditions. 
Consider adding CloudWatch alarms for: 57 | - Application error rates exceeding thresholds 58 | - Critical business metrics falling below acceptable levels 59 | - Infrastructure health checks failing 60 | 61 | ## Observability Recommendations 62 | 63 | Monitor these metrics during the experiment: 64 | - DynamoDB API call success/failure rates 65 | - Application error logs and exception counts 66 | - Network connectivity metrics from application instances 67 | - Circuit breaker state changes 68 | - User experience and transaction success rates 69 | 70 | ## Safety Considerations 71 | 72 | - Test in non-production environments first 73 | - Ensure your application can handle DynamoDB connectivity failures gracefully 74 | - Have rollback procedures ready if manual intervention is needed 75 | - Consider the impact on dependent services and downstream systems 76 | - Verify that critical business processes have appropriate fallback mechanisms 77 | 78 | ## Files Included 79 | 80 | - `dynamodb-traffic-blackhole-region-impairment-template.json` - FIS experiment template 81 | - `dynamodb-traffic-blackhole-region-impairment-iam-policy.json` - Required IAM permissions 82 | - `fis-iam-trust-relationship.json` - IAM trust relationship for FIS role 83 | - `AWSFIS.json` - Template version marker 84 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: ElastiCache Redis Primary Node Reboot 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When the Redis primary node is rebooted, applications should detect the brief connection disruption and reconnect automatically within 30 seconds. Connection pooling should handle the temporary unavailability gracefully, and no data should be lost during the reboot. Application performance should return to normal within 60 seconds of the node becoming available again. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate Redis connection monitoring and observability is in place (were you able to detect the reboot?) 14 | * Alarms are configured correctly for node availability changes (were the right people notified?) 15 | * Your application handles brief Redis connection disruptions gracefully 16 | * Connection pooling and retry logic work correctly during node reboots 17 | * Recovery controls and reconnection mechanisms work as expected 18 | 19 | ## Prerequisites 20 | 21 | Before running this experiment, ensure that: 22 | 23 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 24 | 2. 
You have created the SSM Automation Document from the sample provided (elasticache-redis-primary-node-reboot-automation.yaml) 25 | 3. You have created the FIS Experiment Template from the sample provided (elasticache-redis-primary-node-reboot-experiment-template.json) 26 | 4. The ElastiCache Redis cluster(s) you want to target have the "FIS-Ready":"True" tag and value 27 | 5. Your Redis cluster has Multi-AZ enabled with `AutomaticFailover=enabled` 28 | 6. You have appropriate monitoring and observability in place to track the impact of the experiment. 29 | 30 | ## How it works 31 | 32 | This experiment reboots the Redis primary node to test application resilience during brief connection disruptions. The experiment follows this sequence: 33 | 34 | 1. **Dynamic Discovery**: Scans all ElastiCache replication groups to find clusters tagged with "FIS-Ready":"True" 35 | 2. **Primary Identification**: Dynamically finds the current primary node using NodeGroups and CurrentRole 36 | 3. **Node Reboot**: Executes `reboot_cache_cluster` on the primary node 37 | 4. **Recovery Monitoring**: Tracks node status from "Rebooting cache cluster nodes" to "Available" 38 | 39 | The reboot is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document identifies the primary node and reboots it, then monitors the recovery process until the node returns to available status. 40 | 41 | To verify the experiment is working properly, you can monitor the node status and test connectivity (replace the `<...>` placeholders with your own values): 42 | 43 | ```bash 44 | # Monitor Redis connectivity during reboot 45 | watch -n 5 'redis-cli -h <primary-endpoint> ping' 46 | 47 | # Check node status 48 | aws elasticache describe-replication-groups --replication-group-id <replication-group-id> --query 'ReplicationGroups[0].NodeGroups[0].NodeGroupMembers[?CurrentRole==`primary`].CacheNodeStatus' 49 | 50 | # Monitor application health 51 | curl -I https://<application-endpoint>/health 52 | ``` 53 | 54 | During the experiment, you should see the node status change from "Available" to "Rebooting cache cluster nodes" and back to "Available" within 1-3 minutes. 55 | 56 | ## Stop Conditions 57 | 58 | The experiment does not have any specific stop conditions defined. The reboot completes automatically when the node returns to "Available" status. 59 | 60 | ## Observability and stop conditions 61 | 62 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 63 | 64 | ## Next Steps 65 | 66 | As you adapt this scenario to your needs, we recommend: 67 | 68 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 69 | 2. Identifying business metrics tied to your Redis operations, such as connection counts and application response times. 70 | 3. Creating Amazon CloudWatch metrics and alarms to monitor Redis node availability and connection health. 71 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if needed. 72 | 5. Implementing appropriate connection retry logic in your application to handle brief node unavailability. 73 | 6. Testing your application's Redis connection pooling and recovery mechanisms. 74 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 75 | 76 | ## Import Experiment 77 | 78 | You can import the JSON experiment template into your AWS account via the AWS CLI or AWS CDK.
For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 79 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing FIS Experiment Templates 24 | 25 | This repository contains AWS Fault Injection Service (FIS) experiment templates. When contributing new templates or modifying existing ones, please follow our comprehensive style guide. 26 | 27 | **📋 [Read the complete FIS Template Style Guide](STYLE_GUIDE.md) before contributing** 28 | 29 | ### Template Contribution Checklist 30 | 31 | Before submitting a FIS template, ensure you have: 32 | 33 | - [ ] **Reviewed the style guide**: Read [STYLE_GUIDE.md](STYLE_GUIDE.md) thoroughly 34 | - [ ] **Used the correct structure**: Follow the required directory and file naming conventions 35 | - [ ] **Included all required files**: README.md, AWSFIS.json, template JSON, IAM policy, and trust relationship 36 | - [ ] **Validated JSON files**: All JSON must be valid and properly formatted 37 | - [ ] **Included safety disclaimers**: Use exact disclaimer text as specified 38 | - [ ] **Written comprehensive documentation**: Include hypothesis, prerequisites, and next steps 39 | - [ ] **Followed security best practices**: IAM policies use least privilege with resource tag conditions 40 | - [ ] **Used proper parameterization**: Replace account-specific values with `` placeholders 41 | - [ ] **Added CloudWatch recommendations**: Include observability guidance in next steps 42 | - [ ] **Referenced the gold standard**: Compare your template against `ec2-windows-stop-iis/` example 43 | - [ ] **SSM document compliance**: If using SSM documents, follow comprehensive SSM best practices in the style guide 44 | 45 | ### Template Testing Requirements 46 | 47 | Before submission, verify your template: 48 | 49 | 1. **JSON validation**: Use a JSON validator on all `.json` files 50 | 2. **Markdown validation**: Check formatting with a markdown linter 51 | 3. **Deployment testing**: Test the template in a sandbox AWS environment 52 | 4. **Documentation accuracy**: Verify all instructions are clear and complete 53 | 5. **Security review**: Confirm IAM policies follow least privilege principles 54 | 55 | ## Contributing via Pull Requests 56 | Contributions via pull requests are much appreciated. 
Before sending us a pull request, please ensure that: 57 | 58 | 1. You are working against the latest source on the *main* branch. 59 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 60 | 3. **For FIS templates**: You have completed the template checklist above and reviewed the [style guide](STYLE_GUIDE.md). 61 | 4. You open an issue to discuss any significant work - we would hate for your time to be wasted. 62 | 63 | To send us a pull request, please: 64 | 65 | 1. Fork the repository. 66 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 67 | 3. **For FIS templates**: Ensure your template follows the [style guide](STYLE_GUIDE.md) requirements. 68 | 4. Ensure local tests pass. 69 | 5. Commit to your fork using clear commit messages. 70 | 6. Send us a pull request, answering any default questions in the pull request interface. 71 | 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 72 | 73 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 74 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 75 | 76 | 77 | ## Finding contributions to work on 78 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 79 | 80 | 81 | ## Code of Conduct 82 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 83 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 84 | opensource-codeofconduct@amazon.com with any additional questions or comments. 85 | 86 | 87 | ## Security issue notifications 88 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 89 | 90 | 91 | ## Licensing 92 | 93 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 94 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: ElastiCache Redis Connection Failure 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When Redis connections are disrupted, applications should gracefully handle the failure through circuit breaker mechanisms within 30 seconds. Client retry storms should be prevented, and applications should continue operating in degraded mode without cascading failures. Once Redis connectivity is restored, normal operations should resume within 60 seconds. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate Redis connectivity monitoring and observability is in place (were you able to detect the connection failure?) 14 | * Alarms are configured correctly for connectivity issues (were the right people notified?) 15 | * Your applications handle Redis unavailability gracefully without cascading failures 16 | * Redis client circuit breaker functionality works correctly 17 | * Client-side retry logic doesn't create amplification effects or retry storms 18 | * Recovery controls work as expected when Redis connectivity is restored 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 25 | 2. You have created the SSM Automation Document from the sample provided (redis-connection-failure-automation.yaml) 26 | 3. You have created the FIS Experiment Template from the sample provided (redis-connection-failure-experiment-template.json) 27 | 4. The ElastiCache Redis cluster(s) you want to target have the "FIS-Ready":"True" tag and value 28 | 5. Your applications implement proper Redis client circuit breakers and retry logic 29 | 6. You have appropriate monitoring and observability in place to track the impact of the experiment. 30 | 31 | ## How it works 32 | 33 | This experiment simulates Redis connection failures by modifying ElastiCache security groups to block connections for a specified duration. The experiment follows this sequence: 34 | 35 | 1. **Dynamic Discovery**: Scans all ElastiCache replication groups to find clusters tagged with "FIS-Ready":"True" 36 | 2. **Connection Disruption**: Removes security group rules to block Redis access from applications 37 | 3. **Sustained Failure**: Maintains connection disruption for specified duration to test resilience 38 | 4. **Restoration**: Restores security group rules to resume normal connectivity 39 | 40 | The connection failure is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document modifies security group rules to block access to Redis, then restores connectivity after the specified duration. 41 | 42 | To verify the experiment is working properly, you can monitor Redis connectivity and application behavior (replace the `<...>` placeholders with your own values): 43 | 44 | ```bash 45 | # Monitor Redis connectivity 46 | watch -n 5 'redis-cli -h <primary-endpoint> ping' 47 | 48 | # Check application health endpoints 49 | curl -I https://<application-endpoint>/health 50 | 51 | # Monitor security group rules 52 | aws ec2 describe-security-groups --group-ids <security-group-id> --query 'SecurityGroups[0].IpPermissions' 53 | ``` 54 | 55 | During the experiment, you should observe connection timeouts when attempting to reach Redis and applications activating circuit breakers or degraded mode operations.
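The prerequisites above assume your application already wraps Redis calls in a circuit breaker. As a point of reference only (it is not part of this template), the following is a minimal Python sketch of such a breaker using the open-source `redis-py` client; the endpoint, key name, and thresholds are placeholder values you would replace with your own:

```python
import time
import redis  # assumes the redis-py client library is available


class SimpleCircuitBreaker:
    """Stop calling Redis after repeated failures; retry after a cool-down period."""

    def __init__(self, failure_threshold=3, reset_timeout_seconds=30):
        self.failure_threshold = failure_threshold
        self.reset_timeout_seconds = reset_timeout_seconds
        self.consecutive_failures = 0
        self.opened_at = None

    def call(self, func, *args, **kwargs):
        if self.opened_at is not None:
            if time.monotonic() - self.opened_at < self.reset_timeout_seconds:
                raise RuntimeError("Circuit open: skipping Redis call")
            self.opened_at = None  # half-open: allow a single trial call

        try:
            result = func(*args, **kwargs)
            self.consecutive_failures = 0  # a success closes the circuit
            return result
        except (redis.exceptions.ConnectionError, redis.exceptions.TimeoutError):
            self.consecutive_failures += 1
            if self.consecutive_failures >= self.failure_threshold:
                self.opened_at = time.monotonic()
            raise


# Placeholder endpoint: substitute your own ElastiCache primary endpoint.
client = redis.Redis(host="your-redis-endpoint", port=6379, socket_timeout=2)
breaker = SimpleCircuitBreaker()

try:
    value = breaker.call(client.get, "some-key")
except Exception as exc:
    # Degraded mode: fall back to the primary datastore or a cached default.
    print(f"Redis unavailable, serving degraded response: {exc}")
```

Production applications would normally rely on the retry, backoff, and timeout settings of their Redis client library (or a dedicated resilience library) rather than hand-rolling this logic; the sketch is only meant to make the hypothesis concrete.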
56 | 57 | ## Stop Conditions 58 | 59 | The experiment does not have any specific stop conditions defined. It will continue to run until all actions are completed or until manually stopped. 60 | 61 | ## Observability and stop conditions 62 | 63 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 64 | 65 | ## Next Steps 66 | 67 | As you adapt this scenario to your needs, we recommend: 68 | 69 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 70 | 2. Identifying business metrics tied to your Redis connectivity, such as cache hit rates and application error rates. 71 | 3. Creating Amazon CloudWatch metrics and alarms to monitor Redis connectivity and circuit breaker activation. 72 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if needed. 73 | 5. Implementing appropriate circuit breakers in your application to handle Redis unavailability gracefully. 74 | 6. Testing your application's behavior under various connection failure scenarios and durations. 75 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 76 | 77 | ## Import Experiment 78 | 79 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 80 | -------------------------------------------------------------------------------- /sqs-queue-impairment/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: SQS Queue Impairment 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When the SQS service is experiencing an impairment in a region which impacts my application, an alarm should be raised and the DevOps team notified within 5 minutes. Functionality relating to component A should not be available to end users during the impairment; however, other components should continue to operate normally. Once the SQS impairment has been resolved, component A should become available to end users within 5 minutes. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate customer experience metrics and observability of SQS is in place (were you able to detect there was a problem?) 14 | * Alarms are configured correctly (were the right people notified and/or automations triggered?) 
15 | * Your app gracefully degrades and customers aren't submitting transactions which you know will fail 16 | * Your circuit breaker (if any) works as expected 17 | * Recovery controls (if any) work as expected 18 | 19 | ## Prerequisites 20 | 21 | Before running this experiment, ensure that: 22 | 23 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 24 | 2. You have created the SSM Automation Document from the sample provided (sqs-queue-impairment-tag-based-automation.yaml) 25 | 3. You have created the FIS Experiment Template from the sample provided (sqs-queue-impairment-tag-based-experiment-template.json) 26 | 4. The SQS queue(s) you want to target have the "FIS-Ready":"True" tag and value 27 | 5. You have appropriate monitoring and observability in place to track the impact of the experiment. 28 | 29 | ## How it works 30 | 31 | This experiment simulates a worsening impairment of an SQS queue by applying a deny-all policy that blocks access to the queue for increasing durations. The experiment follows this sequence: 32 | 33 | 1. First impairment: Blocks access to the SQS queue for 2 minutes 34 | 2. Wait period: 3 minutes of normal operation 35 | 3. Second impairment: Blocks access to the SQS queue for 5 minutes 36 | 4. Wait period: 3 minutes of normal operation 37 | 5. Third impairment: Blocks access to the SQS queue for 7 minutes 38 | 6. Wait period: 2 minutes of normal operation 39 | 7. Fourth impairment: Blocks access to the SQS queue for 15 minutes 40 | 41 | The impairment is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document adds a deny statement to the SQS queue policy that prevents all principals from performing key operations like sending and receiving messages. After the specified duration, the Automation Document removes the deny statement, restoring normal access to the queue. 42 | 43 | To verify the experiment is set up and working properly, you can use the AWS CLI to attempt operations on a targeted SQS queue (replace the `<...>` placeholders with your own values): 44 | 45 | ```bash 46 | watch -n 5 'aws sqs send-message --queue-url "https://sqs.<region>.amazonaws.com/<account-id>/<queue-name>" --message-body "This is a test message" --region <region> --no-cli-pager' 47 | ``` 48 | 49 | During the impairment periods, you should see "AccessDenied" errors when attempting to send or receive messages from the queue. 50 | 51 | ![FIS Console showing actions](./images/sqs.png "FIS Console showing actions") 52 | 53 | ## Stop Conditions 54 | 55 | The experiment does not have any specific stop conditions defined. It will continue to run until all actions are completed or until manually stopped. 56 | 57 | ## Observability and stop conditions 58 | 59 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 60 | 61 | ## Next Steps 62 | 63 | As you adapt this scenario to your needs, we recommend: 64 | 65 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 66 | 2. Identifying business metrics tied to your SQS queue processing, such as application transaction rates. 67 | 3. Creating an Amazon CloudWatch metric and Amazon CloudWatch alarm to monitor the impact of the SQS impairment. 68 | 4. Adding a stop condition tied to the alarm to automatically halt the experiment if critical thresholds are breached. 69 | 5.
Implementing appropriate circuit breakers in your application to handle SQS service impairments gracefully. 70 | 6. Testing your application's recovery mechanisms to ensure they work as expected after the SQS service is restored. 71 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 72 | 73 | ## Import Experiment 74 | 75 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 76 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: ElastiCache Redis Primary Node Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When the Redis primary node fails over to a replica, applications should detect the failover and reconnect to the new primary within 30 seconds. Connection pooling should handle the DNS endpoint changes gracefully, and no data should be lost during the transition. Application performance should return to normal within 60 seconds of failover completion. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate Redis connection monitoring and observability is in place (were you able to detect the failover?) 14 | * Alarms are configured correctly for primary node changes (were the right people notified?) 15 | * Your application handles Redis primary node changes gracefully 16 | * Connection pooling and DNS resolution work correctly during failover 17 | * Recovery controls and reconnection logic work as expected 18 | 19 | ## Prerequisites 20 | 21 | Before running this experiment, ensure that: 22 | 23 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 24 | 2. You have created the SSM Automation Document from the sample provided (elasticache-redis-primary-node-failover-automation.json) 25 | 3. You have created the FIS Experiment Template from the sample provided (elasticache-redis-primary-node-failover-experiment-template.json) 26 | 4. The ElastiCache Redis cluster(s) you want to target have the "FIS-Ready":"True" tag and value 27 | 5. Your Redis cluster has Multi-AZ enabled with `AutomaticFailover=enabled` 28 | 6. Your cluster has at least 1 primary + 1 replica node 29 | 7. You have appropriate monitoring and observability in place to track the impact of the experiment. 30 | 31 | ## How it works 32 | 33 | This experiment forces a Redis primary node failover by using the ElastiCache TestFailover API to promote a replica node to primary. 
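As a rough illustration of the underlying API call (not part of the template itself), the snippet below shows how a failover could be triggered with boto3; the replication group and node group IDs are placeholders, and the actual SSM automation discovers these values dynamically from the "FIS-Ready" tag rather than hard-coding them:

```python
import boto3

# Placeholder identifiers: substitute the values for your own cluster.
REPLICATION_GROUP_ID = "my-redis-cluster"
NODE_GROUP_ID = "0001"

elasticache = boto3.client("elasticache")

# Confirm Multi-AZ automatic failover is enabled before triggering the test failover.
group = elasticache.describe_replication_groups(
    ReplicationGroupId=REPLICATION_GROUP_ID
)["ReplicationGroups"][0]
if group["AutomaticFailover"] != "enabled":
    raise RuntimeError("AutomaticFailover must be enabled for this experiment")

# Promote a replica in the node group to primary, as the experiment does.
elasticache.test_failover(
    ReplicationGroupId=REPLICATION_GROUP_ID,
    NodeGroupId=NODE_GROUP_ID,
)
```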
The experiment follows this sequence: 34 | 35 | 1. **Dynamic Discovery**: Scans all ElastiCache replication groups to find clusters tagged with "FIS-Ready":"True" 36 | 2. **Validation**: Ensures the cluster has `AutomaticFailover=enabled` and is in `available` status 37 | 3. **Failover Trigger**: Uses `test_failover` API to promote a replica to primary role 38 | 4. **DNS Update**: ElastiCache automatically updates the master endpoint to point to the new primary 39 | 5. **Role Swap**: The former primary becomes a replica, and the replica becomes the new primary 40 | 41 | The failover is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document uses the ElastiCache TestFailover API to trigger the failover process, which automatically promotes a replica to primary and updates the DNS endpoint. 42 | 43 | To verify the experiment is working properly, you can monitor the primary node before and after (replace the `<...>` placeholders with your own values): 44 | 45 | ```bash 46 | # Check current primary before experiment 47 | aws elasticache describe-replication-groups --replication-group-id <replication-group-id> --query 'ReplicationGroups[0].NodeGroups[0].NodeGroupMembers[?CurrentRole==`primary`].CacheClusterId' 48 | 49 | # Monitor during experiment 50 | watch -n 5 'aws elasticache describe-replication-groups --replication-group-id <replication-group-id> --query "ReplicationGroups[0].Status"' 51 | ``` 52 | 53 | During the failover, you should see the cluster status change from "available" to "modifying" and back to "available" as the primary node changes. 54 | 55 | ## Stop Conditions 56 | 57 | The experiment does not have any specific stop conditions defined. The failover completes automatically when the TestFailover operation finishes and the cluster returns to "available" status. 58 | 59 | ## Observability and stop conditions 60 | 61 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 62 | 63 | ## Next Steps 64 | 65 | As you adapt this scenario to your needs, we recommend: 66 | 67 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 68 | 2. Identifying business metrics tied to your Redis operations, such as cache hit rates and application response times. 69 | 3. Creating Amazon CloudWatch metrics and alarms to monitor Redis failover impact. 70 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if needed. 71 | 5. Implementing appropriate connection retry logic in your application to handle primary node changes. 72 | 6. Testing your application's Redis connection pooling and DNS resolution during failover scenarios. 73 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 74 | 75 | ## Import Experiment 76 | 77 | You can import the JSON experiment template into your AWS account via the AWS CLI or AWS CDK. For step-by-step instructions, [click here](https://github.com/aws-samples/fis-template-library-tooling). 78 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: MySQL RDS Load Test and Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling.
This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Hypothesis 11 | 12 | When high CPU load is generated on a Multi-AZ MySQL RDS instance followed by a failover event, the system will transition from the primary to the standby instance with approximately 25 seconds of downtime, and applications implementing proper connection handling will automatically reconnect with a success rate of nearly 100%, maintaining normal functionality once the failover process completes. 13 | 14 | ## Prerequisites 15 | 16 | Before running this experiment, ensure that: 17 | 18 | 1. You have the necessary permissions to execute the FIS experiment and perform the failover operation on RDS instances. 19 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the failover operation and execute SSM documents. 20 | 3. The MySQL RDS instances you want to target have the `FIS-Ready=True` tag. 21 | 4. **You have an EC2 instance tagged with `FIS-Ready=True` that serves as the load generator** 22 | - This instance must have network connectivity to your MySQL RDS instance 23 | - The instance must have the SSM Agent installed and running 24 | - The instance requires appropriate IAM permissions to execute SSM documents 25 | - The instance will execute CPU-intensive database queries against the MySQL RDS instance 26 | 5. The targeted MySQL RDS instances are configured for Multi-AZ deployment. 27 | 6. The IAM role associated with the EC2 instances has the necessary permissions for SSM. 28 | 7. You have deployed the SSM document template (`mysql-rds-loadtest-failover-ssm-template.json`) to your account. 29 | 30 | ## Architecture Overview 31 | 32 | This experiment uses the following components: 33 | 34 | - **MySQL RDS Instance**: The target database that will experience CPU load and failover 35 | - **EC2 Load Generator Instance**: Executes the SSM document to generate database load 36 | - **SSM Document**: Contains the load testing scripts that create CPU-intensive queries 37 | - **FIS Experiment**: Orchestrates the load generation and failover sequence 38 | 39 | **Critical**: The EC2 instance acts as the load generator and must be able to connect to your MySQL RDS instance. The SSM document will be executed on this instance, not directly on the RDS instance. 40 | 41 | ## EC2 Instance Setup 42 | 43 | Your EC2 instance must meet these requirements: 44 | 45 | 1. **Network Access**: Security groups must allow outbound connections to MySQL RDS on port 3306 46 | 2. **MySQL Client**: Install `mysql-client` or equivalent for database connectivity 47 | 3. **SSM Agent**: Ensure SSM Agent is installed and the instance appears in Systems Manager 48 | 4. **IAM Role**: Attach an IAM role with `AmazonSSMManagedInstanceCore` policy 49 | 5. 
**Tagging**: Tag the instance with `FIS-Ready=True` 50 | 51 | Test connectivity before running the experiment: 52 | ```bash 53 | mysql -h your-rds-endpoint -u your-username -p -e "SELECT 1;" 54 | ``` 55 | 56 | ## ⚠️ Database Impact Warning 57 | 58 | **IMPORTANT**: This experiment will create test tables in your MySQL database: 59 | 60 | ### Tables Created: 61 | - `loadtest` - Load testing table with auto-increment primary key 62 | - Test database (if `DBName` parameter specifies a non-existing database) 63 | 64 | ### Impact: 65 | - Tables will persist after the experiment completes 66 | - Test data will be inserted during load testing 67 | - No existing data will be modified or deleted 68 | - Tables use `IF NOT EXISTS` clauses to avoid conflicts 69 | 70 | ### Cleanup: 71 | If you need to remove the test table after the experiment, you can manually drop it: 72 | ```sql 73 | DROP TABLE IF EXISTS loadtest; 74 | ``` 75 | 76 | ## Stop Conditions 77 | 78 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all actions have been completed on the targeted resources. 79 | 80 | ## Observability and stop conditions 81 | 82 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 83 | business metric requiring an immediate end of the fault injection. This 84 | template makes no assumptions about your application and the relevant metrics 85 | and does not include stop conditions by default. 86 | 87 | ## Next Steps 88 | 89 | As you adapt this scenario to your needs, we recommend: 90 | 91 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 92 | 2. Identifying business metrics tied to your MySQL RDS instance performance. 93 | 3. Creating an Amazon CloudWatch metric and Amazon CloudWatch alarm to monitor the impact of high CPU load and failover. 94 | 4. Adding a stop condition tied to the alarm to automatically halt the experiment if critical thresholds are breached. 95 | 5. Customizing the SSM document parameters to adjust load test concurrency, duration, and target CPU utilization. 96 | 6. Testing the load generation script independently before running the full FIS experiment. 97 | 98 | ## Import Experiment 99 | 100 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 101 | 102 | ## Monitoring Recommendations 103 | 104 | For optimal experiment observability, consider monitoring these key metrics during execution: 105 | - RDS CPU Utilization (target: sustained high load before failover) 106 | - RDS Database Connections (monitor connection drops during failover) 107 | - Application response times and error rates 108 | - RDS Failover completion time via CloudWatch Events 109 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: Aurora Cluster CPU Overload and Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Description 8 | 9 | This experiment simulates CPU overload on an Aurora PostgreSQL cluster and then initiates a failover to test the resilience of your database infrastructure under stress conditions. 10 | 11 | ## Hypothesis 12 | 13 | When high CPU load occurs on an Aurora cluster followed by a subsequent failover, the system will restore normal operation with minimal disruption, and the application's functionality will remain largely unaffected. The automatic recovery process will complete within minutes, and the system's request processing capability will maintain continuity at near 100% efficiency after the failover completes. 14 | 15 | ## Prerequisites 16 | 17 | Before running this experiment, ensure that: 18 | 19 | 1. You have an Aurora PostgreSQL cluster tagged with `FIS-Ready=True` 20 | 2. **You have an EC2 instance tagged with `FIS-Ready=True` that serves as the load generator** 21 | - This instance must have network connectivity to your Aurora cluster 22 | - The instance must have the SSM Agent installed and running 23 | - The instance requires appropriate IAM permissions to execute SSM documents 24 | - The instance will execute CPU-intensive database queries against the Aurora cluster 25 | 3. The Aurora cluster is configured for Multi-AZ deployment with writer and reader instances 26 | 4. You have created the required IAM role with the provided policy document 27 | 5. You have deployed the SSM document for load testing 28 | 6. You have configured appropriate CloudWatch monitoring and alarms 29 | 30 | ## Architecture Overview 31 | 32 | This experiment uses the following components: 33 | 34 | - **Aurora PostgreSQL Cluster**: The target database that will experience CPU load and failover 35 | - **EC2 Load Generator Instance**: Executes the SSM document to generate database load 36 | - **SSM Document**: Contains the load testing scripts that create CPU-intensive queries 37 | - **FIS Experiment**: Orchestrates the load generation and failover sequence 38 | 39 | **Critical**: The EC2 instance acts as the load generator and must be able to connect to your Aurora cluster. The SSM document will be executed on this instance, not directly on the Aurora cluster. 40 | 41 | ## EC2 Instance Setup 42 | 43 | Your EC2 instance must meet these requirements: 44 | 45 | 1. **Network Access**: Security groups must allow outbound connections to Aurora on port 5432 46 | 2. **PostgreSQL Client**: Install `postgresql-client` or equivalent for database connectivity 47 | 3. **SSM Agent**: Ensure SSM Agent is installed and the instance appears in Systems Manager 48 | 4. **IAM Role**: Attach an IAM role with `AmazonSSMManagedInstanceCore` policy 49 | 5. 
**Tagging**: Tag the instance with `FIS-Ready=True` 50 | 51 | Test connectivity before running the experiment: 52 | ```bash 53 | psql -h your-aurora-endpoint -U your-username -d your-database -c "SELECT 1;" 54 | ``` 55 | 56 | ## ⚠️ Database Impact Warning 57 | 58 | **IMPORTANT**: This experiment will create test tables in your Aurora PostgreSQL database: 59 | 60 | ### Tables Created: 61 | - `load_test_users` - User records with status and timestamps 62 | - `load_test_transactions` - Transaction records with foreign key relationships 63 | 64 | ### Impact: 65 | - Tables will persist after the experiment completes 66 | - Test data will be inserted during load testing 67 | - No existing data will be modified or deleted 68 | - Tables use `IF NOT EXISTS` clauses to avoid conflicts 69 | - Indexes will be created for performance testing 70 | 71 | ### Cleanup: 72 | If you need to remove the test tables after the experiment, you can manually drop them: 73 | ```sql 74 | DROP TABLE IF EXISTS load_test_transactions; 75 | DROP TABLE IF EXISTS load_test_users; 76 | ``` 77 | 78 | ## How it works 79 | 80 | This experiment simulates a high CPU load scenario followed by an Aurora DB cluster failover: 81 | 82 | 1. **Baseline establishment**: 5-minute delay to establish baseline metrics 83 | 2. **CPU load generation**: SSM document executes CPU-intensive queries on the Aurora cluster 84 | 3. **Failover initiation**: After the delay, promotes an Aurora Replica to be the primary writer 85 | 4. **Impact observation**: Load test continues to observe failover impact on performance 86 | 87 | The experiment targets resources with the `FIS-Ready=True` tag for safety and control. 88 | 89 | ## Observability and stop conditions 90 | 91 | This template does not include stop conditions by default. You should add CloudWatch alarms based on your specific operational metrics to automatically halt the experiment if critical thresholds are breached. 92 | 93 | ## Files included 94 | 95 | - `aurora-postgres-cluster-loadtest-failover-template.json`: Main FIS experiment template 96 | - `aurora-postgres-cluster-loadtest-failover-iam-policy.json`: Required IAM permissions 97 | - `aurora-postgres-cluster-loadtest-failover-ssm-template.json`: SSM document for load testing 98 | - `fis-iam-trust-relationship.json`: IAM trust policy for FIS service 99 | 100 | ## Next steps 101 | 102 | 1. Review and customize the experiment parameters for your environment 103 | 2. Set up CloudWatch monitoring and create appropriate alarms 104 | 3. Add stop conditions based on your operational metrics 105 | 4. Test the experiment in a non-production environment first 106 | 5. Create a CloudWatch dashboard to visualize experiment effects 107 | 108 | ## Import experiment 109 | 110 | You can import the JSON experiment template into your AWS account via CLI or AWS CDK. For step-by-step instructions, see the [fis-template-library-tooling](https://github.com/aws-samples/fis-template-library-tooling) repository. 111 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: DynamoDB Region Impairment 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS!
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When DynamoDB experiences a complete regional failure in us-east-1 which impacts my global table application, an alarm should be raised and the DevOps team notified within 5 minutes. The application should automatically failover to the us-west-2 replica within 2 minutes. During the impairment, all read and write operations should be redirected to the healthy region. Once the regional failure is resolved, the application should resume normal cross-region operation within 5 minutes. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate customer experience metrics and observability of DynamoDB global tables is in place (were you able to detect there was a problem?) 14 | * Alarms are configured correctly (were the right people notified and/or automations triggered?) 15 | * Your app gracefully fails over to the healthy region and customers can continue using the application 16 | * Your circuit breaker (if any) works as expected for regional failures 17 | * Recovery controls (if any) work as expected when the region comes back online 18 | * Cross-region replication monitoring and alerting functions correctly 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 25 | 2. You have created the SSM Automation Document from the sample provided (dynamodb-region-impairment-automation.yaml) 26 | 3. You have created the FIS Experiment Template from the sample provided (dynamodb-region-impairment-experiment-template.json) 27 | 4. **Update all region references** in the template files to match your target region (currently set to us-east-1 as an example) 28 | 5. **Update table names** in the template files to match your DynamoDB global table names 29 | 6. The DynamoDB global table(s) you want to target have the "FIS-Ready":"True" tag and value 30 | 7. You have appropriate monitoring and observability in place to track the impact of the experiment. 31 | 32 | ## How it works 33 | 34 | This experiment simulates a complete regional DynamoDB failure by combining two complementary actions: 35 | 36 | ### Timeline 37 | - **T+0**: Both actions start simultaneously 38 | - **T+10s**: SSM automation applies application-blocking policy (after FIS policy is established) 39 | - **T+10m**: SSM automation completes and cleans up its policy statements 40 | - **T+12m**: FIS built-in action completes and auto-expires its policy statements 41 | 42 | **Note**: The durations can be modified to fit your testing needs, but the staggered timing should be maintained to prevent race conditions and ensure proper cleanup sequencing. 43 | 44 | ### Actions 45 | 46 | **1. 
Native FIS Action (aws:dynamodb:global-table-pause-replication)** 47 | - Blocks DynamoDB replication service from synchronizing data between regions 48 | - Duration: 12 minutes 49 | - Uses time-based auto-expiring resource policy statements 50 | - Automatically cleans up when experiment completes 51 | 52 | **2. Custom SSM Automation (blockDynamoDBAccess)** 53 | - Blocks all application access (reads/writes) to the table in the target region 54 | - Duration: 10 minutes with 10-second initial delay to avoid race conditions 55 | - Uses resource policy with role exclusions for FIS, SSM, and DynamoDB service roles 56 | - Includes proper cleanup logic to remove only its policy statements 57 | 58 | ### Race Condition Prevention 59 | Both actions start simultaneously but modify the same DynamoDB resource policy. A 10-second sleep was added at the start of the SSM automation document to prevent race conditions - allowing the built-in FIS action to successfully apply its policy first. 60 | 61 | ### Recovery Window Testing 62 | This creates a 2-minute recovery window (minutes 10-12) where application access is restored but replication remains paused, allowing testing of partial recovery scenarios and cross-region failover behavior. 63 | 64 | To verify the experiment is setup and working properly, you can use the AWS CLI to attempt operations on a targeted DynamoDB table: 65 | 66 | ```bash 67 | # Test application access (should fail during impairment) 68 | watch -n 5 'aws dynamodb put-item --table-name my-global-table --item "{\"id\":{\"S\":\"test-$(date +%s)\"},\"message\":{\"S\":\"test message\"}}" --region us-east-1 --no-cli-pager' 69 | 70 | # Test reads (should also fail during impairment) 71 | watch -n 5 'aws dynamodb get-item --table-name my-global-table --key "{\"id\":{\"S\":\"test-item\"}}" --region us-east-1 --no-cli-pager' 72 | 73 | # Test failover region (should continue working) 74 | watch -n 5 'aws dynamodb put-item --table-name my-global-table --item "{\"id\":{\"S\":\"test-$(date +%s)\"},\"message\":{\"S\":\"test message\"}}" --region us-west-2 --no-cli-pager' 75 | ``` 76 | 77 | During the impairment periods, you should see "AccessDenied" errors when attempting operations on the us-east-1 table, while us-west-2 operations continue normally. 78 | 79 | ## Stop Conditions 80 | 81 | The experiment does not have any specific stop conditions defined. It will continue to run until all actions are completed or until manually stopped. 82 | 83 | ## Observability and stop conditions 84 | 85 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 86 | 87 | ## Next Steps 88 | 89 | As you adapt this scenario to your needs, we recommend: 90 | 91 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 92 | 2. Identifying business metrics tied to your DynamoDB global table operations, such as application transaction rates and cross-region latency. 93 | 3. Creating Amazon CloudWatch metrics and alarms to monitor: 94 | - Application error rates during regional failures 95 | - Cross-region failover time 96 | - Data consistency after recovery 97 | - Replication lag between regions 98 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if unacceptable impact occurs. 99 | 5. 
Implementing appropriate circuit breakers in your application to handle regional DynamoDB failures gracefully. 100 | 6. Testing your application's regional failover mechanisms to ensure they work as expected. 101 | 7. Validating that your monitoring can distinguish between planned chaos experiments and real outages. 102 | 8. Documenting the findings from your experiment and updating your incident response procedures accordingly. 103 | 9. Testing recovery procedures to ensure applications properly resume cross-region operations after the experiment. 104 | 105 | ## Import Experiment 106 | 107 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 108 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-automation.yaml: -------------------------------------------------------------------------------- 1 | description: "Apply a deny-all policy to SQS queues with specific tag to simulate impairment using SQS:AddPermission and SQS:RemovePermission" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tagKey: 6 | type: String 7 | description: "Tag key to identify SQS queues to impair" 8 | default: "FIS-Ready" 9 | tagValue: 10 | type: String 11 | description: "Tag value to identify SQS queues to impair" 12 | default: "True" 13 | duration: 14 | type: String 15 | description: "Duration of the impairment in ISO8601 format" 16 | default: "PT10M" 17 | region: 18 | type: String 19 | description: "AWS Region of the SQS queues" 20 | default: "{{global:REGION}}" 21 | AutomationAssumeRole: 22 | type: String 23 | description: "IAM role for the automation execution" 24 | default: "" 25 | 26 | mainSteps: 27 | - name: getTargetQueues 28 | action: aws:executeScript 29 | inputs: 30 | Runtime: python3.11 31 | Handler: get_queues 32 | Script: | 33 | import boto3 34 | 35 | def get_queues(events, context): 36 | region = events['region'] 37 | tag_key = events['tagKey'] 38 | tag_value = events['tagValue'] 39 | 40 | sqs = boto3.client('sqs', region_name=region) 41 | target_queues = [] 42 | 43 | # Find queues by tag 44 | response = sqs.list_queues() 45 | if 'QueueUrls' not in response: 46 | return [] 47 | 48 | for queue_url in response['QueueUrls']: 49 | try: 50 | tags_response = sqs.list_queue_tags(QueueUrl=queue_url) 51 | tags = tags_response.get('Tags', {}) 52 | 53 | if tags.get(tag_key) == tag_value: 54 | target_queues.append(queue_url) 55 | except Exception as e: 56 | print(f"Error getting tags for queue {queue_url}: {str(e)}") 57 | continue 58 | 59 | return target_queues 60 | InputPayload: 61 | region: "{{ region }}" 62 | tagKey: "{{ tagKey }}" 63 | tagValue: "{{ tagValue }}" 64 | outputs: 65 | - Name: targetQueues 66 | Selector: $.Payload 67 | Type: StringList 68 | description: "Find all SQS queues with the specified tag or use the provided queue URL" 69 | 70 | - name: applyDenyAllPolicyToQueues 71 | action: aws:executeScript 72 | onFailure: "step:removeDenyAllPolicyFromQueues" 73 | onCancel: "step:removeDenyAllPolicyFromQueues" 74 | inputs: 75 | Runtime: python3.11 76 | Handler: apply_deny_policy 77 | Script: | 78 | import boto3 79 | import json 80 | 81 | def apply_deny_policy(events, context): 82 | region = events['region'] 83 | target_queues = events['targetQueues'] 84 | 85 | sqs = boto3.client('sqs', region_name=region) 86 | results = [] 87 | 88 | for 
queue_url in target_queues: 89 | try: 90 | # Get existing policy 91 | response = sqs.get_queue_attributes( 92 | QueueUrl=queue_url, 93 | AttributeNames=['Policy'] 94 | ) 95 | 96 | existing_policy = {} 97 | if 'Policy' in response.get('Attributes', {}): 98 | existing_policy = json.loads(response['Attributes']['Policy']) 99 | else: 100 | existing_policy = { 101 | "Version": "2012-10-17", 102 | "Statement": [] 103 | } 104 | 105 | # Add deny statement 106 | deny_statement = { 107 | "Effect": "Deny", 108 | "Principal": "*", 109 | "Action": [ 110 | "sqs:DeleteMessage", 111 | "sqs:ChangeMessageVisibility", 112 | "sqs:PurgeQueue", 113 | "sqs:ReceiveMessage", 114 | "sqs:SendMessage" 115 | ], 116 | "Resource": "*", 117 | "Sid": "FISTemporaryDeny" 118 | } 119 | 120 | # Remove any existing statement with the same Sid to avoid duplicates 121 | existing_policy['Statement'] = [s for s in existing_policy.get('Statement', []) 122 | if s.get('Sid') != 'FISTemporaryDeny'] 123 | 124 | existing_policy['Statement'].append(deny_statement) 125 | sqs.set_queue_attributes( 126 | QueueUrl=queue_url, 127 | Attributes={ 128 | 'Policy': json.dumps(existing_policy) 129 | } 130 | ) 131 | 132 | results.append(f"Successfully applied deny policy to {queue_url}") 133 | except Exception as e: 134 | raise RuntimeError(f"Failed to apply SQS policy to {queue_url}: {e}") from e 135 | 136 | return { 137 | 'affectedQueues': target_queues, 138 | 'results': results 139 | } 140 | InputPayload: 141 | region: "{{ region }}" 142 | targetQueues: "{{ getTargetQueues.targetQueues }}" 143 | outputs: 144 | - Name: affectedQueues 145 | Selector: $.Payload.affectedQueues 146 | Type: StringList 147 | - Name: results 148 | Selector: $.Payload.results 149 | Type: StringList 150 | description: "Apply deny-all policy to all target SQS queues" 151 | 152 | - name: waitForDuration 153 | action: "aws:sleep" 154 | onFailure: "step:removeDenyAllPolicyFromQueues" 155 | onCancel: "step:removeDenyAllPolicyFromQueues" 156 | inputs: 157 | Duration: "{{ duration }}" 158 | description: "Wait for the specified duration while the SQS queues are impaired" 159 | 160 | - name: removeDenyAllPolicyFromQueues 161 | action: aws:executeScript 162 | inputs: 163 | Runtime: python3.11 164 | Handler: remove_deny_policy 165 | Script: | 166 | import boto3 167 | 168 | def remove_deny_policy(events, context): 169 | region = events['region'] 170 | affected_queues = events['affectedQueues'] 171 | 172 | sqs = boto3.client('sqs', region_name=region) 173 | results = [] 174 | 175 | for queue_url in affected_queues: 176 | try: 177 | sqs.remove_permission( 178 | QueueUrl=queue_url, 179 | Label='FISTemporaryDeny' 180 | ) 181 | results.append(f"Successfully removed deny policy from {queue_url}") 182 | except Exception as e: 183 | results.append(f"Failed to remove deny policy from {queue_url}: {str(e)}") 184 | 185 | return results 186 | InputPayload: 187 | region: "{{ region }}" 188 | affectedQueues: "{{ applyDenyAllPolicyToQueues.affectedQueues }}" 189 | outputs: 190 | - Name: results 191 | Selector: $.Payload 192 | Type: StringList 193 | description: "Remove the deny permission from all affected SQS queues to restore normal operation" 194 | isEnd: true 195 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/aurora-postgres-cluster-loadtest-failover-ssm-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2.2", 3 | "description": "Run database 
load test on Aurora PostgreSQL cluster", 4 | "parameters": { 5 | "DBHost": { 6 | "type": "String", 7 | "description": "Database host endpoint", 8 | "default": "{{resolve:ssm:/aurora-cluster/endpoint}}" 9 | }, 10 | "DBPort": { 11 | "type": "String", 12 | "description": "Database port", 13 | "default": "5432" 14 | }, 15 | "DBName": { 16 | "type": "String", 17 | "description": "Database name", 18 | "default": "mydb" 19 | }, 20 | "DBUser": { 21 | "type": "String", 22 | "description": "Database username", 23 | "default": "postgres" 24 | }, 25 | "DBPassword": { 26 | "type": "String", 27 | "description": "Database password", 28 | "default": "{{resolve:secretsmanager:aurora-cluster-password:SecretString:password}}" 29 | }, 30 | "NumRecords": { 31 | "type": "String", 32 | "description": "Number of records to generate", 33 | "default": "10000" 34 | }, 35 | "Concurrency": { 36 | "type": "String", 37 | "description": "Concurrency level for the load test", 38 | "default": "5" 39 | }, 40 | "Duration": { 41 | "type": "String", 42 | "description": "Duration of the load test in seconds", 43 | "default": "600" 44 | } 45 | }, 46 | "mainSteps": [ 47 | { 48 | "action": "aws:runShellScript", 49 | "name": "installDependencies", 50 | "inputs": { 51 | "runCommand": [ 52 | "#!/bin/bash", 53 | "# Install PostgreSQL client with OS detection", 54 | "if [ -f \"/etc/system-release\" ] && grep -i 'Amazon Linux' /etc/system-release; then", 55 | " if ! grep -Fiq 'VERSION_ID=\"2023\"' /etc/os-release; then", 56 | " # Amazon Linux 2 or earlier", 57 | " sudo yum install -y postgresql", 58 | " elif grep -Fiq 'ID=\"amzn\"' /etc/os-release && grep -Fiq 'VERSION_ID=\"2023\"' /etc/os-release; then", 59 | " # Amazon Linux 2023", 60 | " sudo yum install -y postgresql15", 61 | " fi", 62 | "elif grep -Fiq 'ID=\"centos\"' /etc/os-release || grep -Fiq 'ID=\"rhel\"' /etc/os-release; then", 63 | " # CentOS/RHEL", 64 | " sudo yum install -y postgresql", 65 | "elif grep -Fiq 'ID=ubuntu' /etc/os-release || grep -Fiq 'ID=debian' /etc/os-release; then", 66 | " # Ubuntu/Debian", 67 | " sudo apt-get update && sudo apt-get install -y postgresql-client", 68 | "else", 69 | " echo \"Unsupported OS. Please install PostgreSQL client manually.\"", 70 | " exit 1", 71 | "fi" 72 | ] 73 | } 74 | }, 75 | { 76 | "action": "aws:runShellScript", 77 | "name": "runLoadTest", 78 | "inputs": { 79 | "timeoutSeconds": "900", 80 | "runCommand": [ 81 | "#!/bin/bash", 82 | "# Configuration", 83 | "DB_HOST=\"{{ DBHost }}\"", 84 | "DB_PORT=\"{{ DBPort }}\"", 85 | "DB_NAME=\"{{ DBName }}\"", 86 | "DB_USER=\"{{ DBUser }}\"", 87 | "DB_PASSWORD=\"{{ DBPassword }}\"", 88 | "NUM_RECORDS=\"{{ NumRecords }}\"", 89 | "CONCURRENCY=\"{{ Concurrency }}\"", 90 | "DURATION=\"{{ Duration }}\"", 91 | "", 92 | "# Function to execute SQL", 93 | "execute_sql() {", 94 | " PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c \"$1\"", 95 | "}", 96 | "", 97 | "# Function to execute SQL and return result", 98 | "execute_sql_return() {", 99 | " PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -t -c \"$1\" | tr -d '[:space:]'", 100 | "}", 101 | "", 102 | "# Check connection", 103 | "echo \"Checking connection to PostgreSQL...\"", 104 | "if ! execute_sql \"\\conninfo\"; then", 105 | " echo \"Failed to connect to PostgreSQL. 
Please check your connection parameters.\"", 106 | " exit 1", 107 | "fi", 108 | "", 109 | "# Create test tables if they don't exist", 110 | "echo \"Setting up test tables...\"", 111 | "execute_sql \"", 112 | "CREATE TABLE IF NOT EXISTS load_test_users (", 113 | " id SERIAL PRIMARY KEY,", 114 | " username VARCHAR(50) NOT NULL,", 115 | " email VARCHAR(100) NOT NULL,", 116 | " created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,", 117 | " status VARCHAR(20) DEFAULT 'active',", 118 | " login_count INTEGER DEFAULT 0", 119 | ");", 120 | "", 121 | "CREATE INDEX IF NOT EXISTS idx_load_test_users_username ON load_test_users(username);", 122 | "CREATE INDEX IF NOT EXISTS idx_load_test_users_email ON load_test_users(email);", 123 | "CREATE INDEX IF NOT EXISTS idx_load_test_users_status ON load_test_users(status);", 124 | "", 125 | "CREATE TABLE IF NOT EXISTS load_test_transactions (", 126 | " id SERIAL PRIMARY KEY,", 127 | " user_id INTEGER REFERENCES load_test_users(id),", 128 | " amount DECIMAL(10,2) NOT NULL,", 129 | " transaction_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,", 130 | " description TEXT,", 131 | " category VARCHAR(50)", 132 | ");", 133 | "", 134 | "CREATE INDEX IF NOT EXISTS idx_load_test_transactions_user_id ON load_test_transactions(user_id);", 135 | "CREATE INDEX IF NOT EXISTS idx_load_test_transactions_date ON load_test_transactions(transaction_date);", 136 | "CREATE INDEX IF NOT EXISTS idx_load_test_transactions_category ON load_test_transactions(category);", 137 | "\"", 138 | "", 139 | "# Function to run CPU-intensive queries", 140 | "run_cpu_intensive_query() {", 141 | " local query=\"", 142 | " WITH RECURSIVE cpu_load AS (", 143 | " SELECT 1 as n, random() as r", 144 | " UNION ALL", 145 | " SELECT n + 1, random() * r", 146 | " FROM cpu_load", 147 | " WHERE n < 1000", 148 | " ),", 149 | " complex_aggregation AS (", 150 | " SELECT ", 151 | " u.id,", 152 | " u.username,", 153 | " COUNT(t.id) * SUM(t.amount) / NULLIF(AVG(t.amount), 0) as complex_metric,", 154 | " STDDEV(t.amount) as amount_stddev,", 155 | " PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY t.amount) as median_amount", 156 | " FROM load_test_users u", 157 | " JOIN load_test_transactions t ON u.id = t.user_id", 158 | " GROUP BY u.id, u.username", 159 | " )", 160 | " SELECT ca.*, cl.r", 161 | " FROM complex_aggregation ca", 162 | " CROSS JOIN cpu_load cl", 163 | " ORDER BY ca.complex_metric DESC, cl.r", 164 | " LIMIT 100;", 165 | " \"", 166 | " while true; do", 167 | " execute_sql \"$query\" > /dev/null 2>&1", 168 | " sleep 0.1", 169 | " done", 170 | "}", 171 | "", 172 | "# Start CPU-intensive load test", 173 | "echo \"Starting CPU-intensive load test with concurrency $CONCURRENCY for $DURATION seconds...\"", 174 | "", 175 | "# Start worker processes", 176 | "pids=()", 177 | "for i in $(seq 1 $CONCURRENCY); do", 178 | " run_cpu_intensive_query &", 179 | " pids+=($!)", 180 | "done", 181 | "", 182 | "echo \"Load test is running with ${#pids[@]} worker processes.\"", 183 | "", 184 | "# Sleep for the specified duration", 185 | "sleep $DURATION", 186 | "", 187 | "# Kill all worker processes", 188 | "echo \"Stopping load test...\"", 189 | "for pid in \"${pids[@]}\"; do", 190 | " kill -9 $pid 2>/dev/null", 191 | "done", 192 | "", 193 | "echo \"CPU load test completed.\"" 194 | ] 195 | } 196 | } 197 | ] 198 | } 199 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-redis-primary-node-reboot-automation.yaml: 
-------------------------------------------------------------------------------- 1 | description: "Simulate ElastiCache Redis primary node reboot to test application resilience" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tagKey: 6 | type: String 7 | description: "Tag key to identify ElastiCache clusters to target" 8 | default: "FIS-Ready" 9 | tagValue: 10 | type: String 11 | description: "Tag value to identify ElastiCache clusters to target" 12 | default: "True" 13 | region: 14 | type: String 15 | description: "AWS Region of the ElastiCache clusters" 16 | default: "{{global:REGION}}" 17 | AutomationAssumeRole: 18 | type: String 19 | description: "IAM role for the automation execution" 20 | default: "" 21 | 22 | mainSteps: 23 | - name: triggerNodeFailover 24 | action: aws:executeScript 25 | inputs: 26 | Runtime: python3.11 27 | Handler: trigger_failover 28 | Script: | 29 | import boto3 30 | 31 | def trigger_failover(events, context): 32 | region = events["region"] 33 | tag_key = events["tagKey"] 34 | tag_value = events["tagValue"] 35 | 36 | elasticache = boto3.client("elasticache", region_name=region) 37 | results = [] 38 | 39 | # Get replication groups 40 | response = elasticache.describe_replication_groups() 41 | 42 | for rg in response["ReplicationGroups"]: 43 | if rg["Status"] == "available" and rg.get("AutomaticFailover") == "enabled": 44 | rg_id = rg["ReplicationGroupId"] 45 | 46 | try: 47 | # Check tags 48 | account_id = boto3.client("sts").get_caller_identity()["Account"] 49 | arn = "arn:aws:elasticache:{}:{}:replicationgroup:{}".format(region, account_id, rg_id) 50 | tags_response = elasticache.list_tags_for_resource(ResourceName=arn) 51 | tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagList", [])} 52 | 53 | if tags.get(tag_key) == tag_value: 54 | # Find primary cluster 55 | primary_cluster_id = None 56 | for node_group in rg["NodeGroups"]: 57 | for member in node_group["NodeGroupMembers"]: 58 | if member["CurrentRole"] == "primary": 59 | primary_cluster_id = member["CacheClusterId"] 60 | break 61 | if primary_cluster_id: 62 | break 63 | 64 | if primary_cluster_id: 65 | # Reboot primary node 66 | elasticache.reboot_cache_cluster( 67 | CacheClusterId=primary_cluster_id, 68 | CacheNodeIdsToReboot=["0001"] 69 | ) 70 | results.append("SUCCESS: Triggered failover for {} (primary: {})".format(rg_id, primary_cluster_id)) 71 | else: 72 | results.append("ERROR: No primary found for {}".format(rg_id)) 73 | 74 | except Exception as e: 75 | results.append("ERROR: Failed to process {}: {}".format(rg_id, str(e))) 76 | 77 | return results 78 | InputPayload: 79 | region: "{{ region }}" 80 | tagKey: "{{ tagKey }}" 81 | tagValue: "{{ tagValue }}" 82 | outputs: 83 | - Name: results 84 | Selector: $.Payload 85 | Type: StringList 86 | description: "Trigger node failover by rebooting primary nodes" 87 | 88 | - name: monitorPrimaryNodeRecovery 89 | action: aws:executeScript 90 | inputs: 91 | Runtime: python3.11 92 | Handler: monitor_node 93 | Script: | 94 | import boto3 95 | import time 96 | 97 | def monitor_node(events, context): 98 | region = events["region"] 99 | tag_key = events["tagKey"] 100 | tag_value = events["tagValue"] 101 | 102 | elasticache = boto3.client("elasticache", region_name=region) 103 | results = [] 104 | start_time = time.time() 105 | 106 | # Find the primary node that was rebooted 107 | response = elasticache.describe_replication_groups() 108 | primary_cluster_id = None 109 | 110 | for rg in 
response["ReplicationGroups"]: 111 | if rg["Status"] == "available" and rg.get("AutomaticFailover") == "enabled": 112 | rg_id = rg["ReplicationGroupId"] 113 | 114 | try: 115 | # Check tags 116 | account_id = boto3.client("sts").get_caller_identity()["Account"] 117 | arn = "arn:aws:elasticache:{}:{}:replicationgroup:{}".format(region, account_id, rg_id) 118 | tags_response = elasticache.list_tags_for_resource(ResourceName=arn) 119 | tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagList", [])} 120 | 121 | if tags.get(tag_key) == tag_value: 122 | # Find primary cluster 123 | for node_group in rg["NodeGroups"]: 124 | for member in node_group["NodeGroupMembers"]: 125 | if member["CurrentRole"] == "primary": 126 | primary_cluster_id = member["CacheClusterId"] 127 | break 128 | if primary_cluster_id: 129 | break 130 | except Exception as e: 131 | continue 132 | 133 | if not primary_cluster_id: 134 | return ["ERROR: Could not find primary node to monitor"] 135 | 136 | # Monitor the primary node status 137 | max_wait_time = 600 # 10 minutes 138 | check_interval = 10 # Check every 10 seconds 139 | 140 | for attempt in range(max_wait_time // check_interval): 141 | try: 142 | cluster_response = elasticache.describe_cache_clusters( 143 | CacheClusterId=primary_cluster_id, 144 | ShowCacheNodeInfo=True 145 | ) 146 | 147 | cluster = cluster_response["CacheClusters"][0] 148 | cluster_status = cluster["CacheClusterStatus"] 149 | 150 | if cluster_status == "available": 151 | recovery_time = time.time() - start_time 152 | results.append("SUCCESS: Primary node {} recovered to Available in {:.1f} seconds".format(primary_cluster_id, recovery_time)) 153 | break 154 | else: 155 | elapsed = time.time() - start_time 156 | results.append("MONITORING: Primary node {} status: {} after {:.1f}s".format(primary_cluster_id, cluster_status, elapsed)) 157 | time.sleep(check_interval) 158 | 159 | except Exception as e: 160 | results.append("ERROR: Failed to check primary node {}: {}".format(primary_cluster_id, str(e))) 161 | break 162 | else: 163 | results.append("TIMEOUT: Primary node {} did not recover within {} seconds".format(primary_cluster_id, max_wait_time)) 164 | 165 | return results 166 | InputPayload: 167 | region: "{{ region }}" 168 | tagKey: "{{ tagKey }}" 169 | tagValue: "{{ tagValue }}" 170 | outputs: 171 | - Name: results 172 | Selector: $.Payload 173 | Type: StringList 174 | description: "Monitor primary node recovery until Available status" 175 | isEnd: true 176 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-automation.yaml: -------------------------------------------------------------------------------- 1 | description: "Simulate Redis connection failure by modifying ElastiCache security groups" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tagKey: 6 | type: String 7 | description: "Tag key to identify ElastiCache clusters to target" 8 | default: "FIS-Ready" 9 | tagValue: 10 | type: String 11 | description: "Tag value to identify ElastiCache clusters to target" 12 | default: "True" 13 | duration: 14 | type: String 15 | description: "Duration of the connection failure in ISO8601 format" 16 | default: "PT5M" 17 | region: 18 | type: String 19 | description: "AWS Region of the ElastiCache clusters" 20 | default: "{{global:REGION}}" 21 | AutomationAssumeRole: 22 | type: String 23 | description: "IAM role for the automation execution" 24 | default: "" 25 | 
26 | mainSteps: 27 | - name: getTargetClusters 28 | action: aws:executeScript 29 | inputs: 30 | Runtime: python3.11 31 | Handler: get_clusters 32 | Script: | 33 | import boto3 34 | 35 | def get_clusters(events, context): 36 | region = events['region'] 37 | tag_key = events['tagKey'] 38 | tag_value = events['tagValue'] 39 | 40 | elasticache = boto3.client('elasticache', region_name=region) 41 | target_clusters = [] 42 | 43 | # Get Redis clusters 44 | paginator = elasticache.get_paginator('describe_cache_clusters') 45 | 46 | for page in paginator.paginate(): 47 | for cluster in page['CacheClusters']: 48 | if cluster['Engine'] == 'redis': 49 | cluster_id = cluster['CacheClusterId'] 50 | 51 | try: 52 | # Get cluster tags 53 | tags_response = elasticache.list_tags_for_resource( 54 | ResourceName=f"arn:aws:elasticache:{region}:{boto3.client('sts').get_caller_identity()['Account']}:cluster:{cluster_id}" 55 | ) 56 | 57 | tags = {tag['Key']: tag['Value'] for tag in tags_response.get('TagList', [])} 58 | 59 | if tags.get(tag_key) == tag_value: 60 | # Get security groups 61 | security_groups = cluster.get('SecurityGroups', []) 62 | if security_groups: 63 | target_clusters.append({ 64 | 'cluster_id': cluster_id, 65 | 'security_groups': [sg['SecurityGroupId'] for sg in security_groups] 66 | }) 67 | 68 | except Exception as e: 69 | print(f"Error processing cluster {cluster_id}: {str(e)}") 70 | continue 71 | 72 | return target_clusters 73 | InputPayload: 74 | region: "{{ region }}" 75 | tagKey: "{{ tagKey }}" 76 | tagValue: "{{ tagValue }}" 77 | outputs: 78 | - Name: targetClusters 79 | Selector: $.Payload 80 | Type: MapList 81 | description: "Find ElastiCache Redis clusters with specified tags" 82 | 83 | - name: disableRedisConnections 84 | action: aws:executeScript 85 | onFailure: "step:restoreRedisConnections" 86 | onCancel: "step:restoreRedisConnections" 87 | inputs: 88 | Runtime: python3.11 89 | Handler: disable_connections 90 | Script: | 91 | import boto3 92 | 93 | def disable_connections(events, context): 94 | region = events['region'] 95 | target_clusters = events['targetClusters'] 96 | 97 | ec2 = boto3.client('ec2', region_name=region) 98 | results = [] 99 | modified_rules = [] 100 | 101 | for cluster_info in target_clusters: 102 | cluster_id = cluster_info['cluster_id'] 103 | security_groups = cluster_info['security_groups'] 104 | 105 | for sg_id in security_groups: 106 | try: 107 | # Get current security group rules 108 | response = ec2.describe_security_groups(GroupIds=[sg_id]) 109 | sg = response['SecurityGroups'][0] 110 | 111 | # Store original inbound rules for Redis port (6379) 112 | redis_rules = [] 113 | for rule in sg['IpPermissions']: 114 | if rule.get('FromPort') == 6379 and rule.get('ToPort') == 6379: 115 | redis_rules.append(rule) 116 | 117 | if redis_rules: 118 | # Remove Redis access rules 119 | ec2.revoke_security_group_ingress( 120 | GroupId=sg_id, 121 | IpPermissions=redis_rules 122 | ) 123 | 124 | modified_rules.append({ 125 | 'security_group_id': sg_id, 126 | 'cluster_id': cluster_id, 127 | 'removed_rules': redis_rules 128 | }) 129 | 130 | results.append(f"Disabled Redis connections for cluster {cluster_id}, SG {sg_id}") 131 | 132 | except Exception as e: 133 | results.append(f"Failed to modify SG {sg_id} for cluster {cluster_id}: {str(e)}") 134 | 135 | return { 136 | 'modifiedRules': modified_rules, 137 | 'results': results 138 | } 139 | InputPayload: 140 | region: "{{ region }}" 141 | targetClusters: "{{ getTargetClusters.targetClusters }}" 142 | outputs: 143 | - Name: 
modifiedRules 144 | Selector: $.Payload.modifiedRules 145 | Type: MapList 146 | - Name: results 147 | Selector: $.Payload.results 148 | Type: StringList 149 | description: "Disable Redis connections by removing security group rules" 150 | 151 | - name: waitForDuration 152 | action: "aws:sleep" 153 | onFailure: "step:restoreRedisConnections" 154 | onCancel: "step:restoreRedisConnections" 155 | inputs: 156 | Duration: "{{ duration }}" 157 | description: "Wait for the specified duration while Redis connections are blocked" 158 | 159 | - name: restoreRedisConnections 160 | action: aws:executeScript 161 | inputs: 162 | Runtime: python3.11 163 | Handler: restore_connections 164 | Script: | 165 | import boto3 166 | 167 | def restore_connections(events, context): 168 | region = events['region'] 169 | modified_rules = events['modifiedRules'] 170 | 171 | ec2 = boto3.client('ec2', region_name=region) 172 | results = [] 173 | 174 | for rule_info in modified_rules: 175 | sg_id = rule_info['security_group_id'] 176 | cluster_id = rule_info['cluster_id'] 177 | removed_rules = rule_info['removed_rules'] 178 | 179 | try: 180 | # Restore the original rules 181 | if removed_rules: 182 | ec2.authorize_security_group_ingress( 183 | GroupId=sg_id, 184 | IpPermissions=removed_rules 185 | ) 186 | results.append(f"Restored Redis connections for cluster {cluster_id}, SG {sg_id}") 187 | 188 | except Exception as e: 189 | results.append(f"Failed to restore SG {sg_id} for cluster {cluster_id}: {str(e)}") 190 | 191 | return results 192 | InputPayload: 193 | region: "{{ region }}" 194 | modifiedRules: "{{ disableRedisConnections.modifiedRules }}" 195 | outputs: 196 | - Name: results 197 | Selector: $.Payload 198 | Type: StringList 199 | description: "Restore Redis connections by adding back security group rules" 200 | isEnd: true 201 | -------------------------------------------------------------------------------- /ec2-windows-stop-iis/ec2-windows-stop-iis-ssm-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2.2", 3 | "description": "Stop IIS Application Pool for FIS experiment", 4 | "parameters": { 5 | "DurationSeconds": { 6 | "type": "String", 7 | "default": "120", 8 | "description": "Duration of test in seconds.", 9 | "allowedPattern": "([1-9][0-9]{0,4})|(1[0-6][0-9]{4})|(17[0-1][0-9]{3})|(172[0-7][0-9]{2})|(172800)" 10 | }, 11 | "IISAppPoolName": { 12 | "type": "String", 13 | "default": "DefaultAppPool", 14 | "description": "Name of the Windows IIS Application Pool to Stop", 15 | "allowedPattern": "^[a-zA-Z0-9\\-_\\.]{1,50}$" 16 | } 17 | }, 18 | "mainSteps": [ 19 | { 20 | "action": "aws:runPowerShellScript", 21 | "name": "ValidatePrerequisites", 22 | "precondition": { 23 | "StringEquals": [ 24 | "platformType", 25 | "Windows" 26 | ] 27 | }, 28 | "inputs": { 29 | "timeoutSeconds": 60, 30 | "onFailure": "exit", 31 | "runCommand": [ 32 | "function Write-Log {", 33 | " param($Message)", 34 | " $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'", 35 | " Write-Output \"[$timestamp] $Message\"", 36 | "}", 37 | "", 38 | "try {", 39 | " # Check if IIS modules are installed", 40 | " Write-Log \"Checking if IIS modules are installed...\"", 41 | " $iisModule = Get-Module -ListAvailable -Name WebAdministration", 42 | " if (-not $iisModule) {", 43 | " Write-Log \"ERROR: IIS WebAdministration module is not installed\"", 44 | " Exit 1", 45 | " }", 46 | " Write-Log \"IIS WebAdministration module is installed\"", 47 | "", 48 | " # Import the WebAdministration 
module", 49 | " Import-Module WebAdministration", 50 | "", 51 | " # Check if experiment is already running", 52 | " if (Test-Path -Path 'C:\\temp\\fis_windows_iis_experiment.json') {", 53 | " Write-Log \"ERROR: fis_windows_iis_experiment.json already exists. Exiting.\"", 54 | " Exit 1", 55 | " }", 56 | "", 57 | " # Create temp directory if it doesn't exist", 58 | " if (-not (Test-Path -Path 'C:\\temp')) {", 59 | " Write-Log \"Creating C:\\temp directory\"", 60 | " New-Item -Path 'C:\\temp' -ItemType Directory -Force | Out-Null", 61 | " }", 62 | "", 63 | " # Verify IIS Application Pool exists", 64 | " Write-Log \"Verifying IIS Application Pool: {{IISAppPoolName}}\"", 65 | " $appPool = Get-IISAppPool -Name {{IISAppPoolName}} -ErrorAction SilentlyContinue", 66 | " if (-not $appPool) {", 67 | " Write-Log \"ERROR: Application Pool {{IISAppPoolName}} not found\"", 68 | " Exit 1", 69 | " }", 70 | "", 71 | " # Verify IIS Application Pool is in Running state", 72 | " Write-Log \"Checking if Application Pool {{IISAppPoolName}} is running...\"", 73 | " if ($appPool.State -ne \"Started\") {", 74 | " Write-Log \"ERROR: Application Pool {{IISAppPoolName}} is not in 'Started' state. Current state: $($appPool.State)\"", 75 | " Write-Log \"The experiment requires the application pool to be in 'Started' state to proceed.\"", 76 | " Exit 1", 77 | " }", 78 | " Write-Log \"Application Pool {{IISAppPoolName}} is in 'Started' state. Proceeding with experiment.\"", 79 | "", 80 | " # Store initial state for idempotency", 81 | " $initialState = @{", 82 | " 'AppPoolName' = '{{IISAppPoolName}}'", 83 | " 'InitialState' = $appPool.State", 84 | " 'StartTime' = (Get-Date).ToString('o')", 85 | " 'ExperimentDuration' = {{DurationSeconds}}", 86 | " }", 87 | " $initialState | ConvertTo-Json | Out-File -FilePath 'C:\\temp\\fis_windows_iis_experiment.json'", 88 | " Write-Log \"Prerequisites validated successfully\"", 89 | "}", 90 | "catch {", 91 | " Write-Log \"ERROR during validation: $($_.Exception.Message)\"", 92 | " Exit 1", 93 | "}" 94 | ] 95 | } 96 | }, 97 | { 98 | "action": "aws:runPowerShellScript", 99 | "name": "StopIISAppPool", 100 | "precondition": { 101 | "StringEquals": [ 102 | "platformType", 103 | "Windows" 104 | ] 105 | }, 106 | "inputs": { 107 | "timeoutSeconds": 120, 108 | "onFailure": "exit", 109 | "runCommand": [ 110 | "function Write-Log {", 111 | " param($Message)", 112 | " $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'", 113 | " Write-Output \"[$timestamp] $Message\"", 114 | "}", 115 | "", 116 | "try {", 117 | " # Import the WebAdministration module", 118 | " Import-Module WebAdministration", 119 | "", 120 | " # Load experiment data", 121 | " $experimentData = Get-Content -Path 'C:\\temp\\fis_windows_iis_experiment.json' | ConvertFrom-Json", 122 | " $start_time = Get-Date", 123 | "", 124 | " # Stop the application pool", 125 | " Write-Log \"Stopping IIS Application Pool: {{IISAppPoolName}}\"", 126 | " $appPool = Get-IISAppPool -Name {{IISAppPoolName}}", 127 | " $appPool | Stop-WebAppPool", 128 | "", 129 | " # Verify the app pool is stopped", 130 | " $stoppedPool = Get-IISAppPool -Name {{IISAppPoolName}}", 131 | " if ($stoppedPool.State -ne 'Stopped') {", 132 | " throw \"Failed to stop application pool\"", 133 | " }", 134 | " Write-Log \"Application Pool stopped successfully\"", 135 | "", 136 | " # Wait for the specified duration", 137 | " Write-Log \"Sleeping for {{DurationSeconds}} seconds\"", 138 | " Start-Sleep -Seconds {{DurationSeconds}}", 139 | "}", 140 | "catch {", 141 | " Write-Log \"ERROR 
during execution: $($_.Exception.Message)\"", 142 | " # Attempt to restore the app pool even if there was an error", 143 | " try {", 144 | " Write-Log \"Attempting to restore IIS Application Pool after error\"", 145 | " Start-WebAppPool -Name {{IISAppPoolName}}", 146 | " }", 147 | " catch {", 148 | " Write-Log \"ERROR during emergency restoration: $($_.Exception.Message)\"", 149 | " }", 150 | " Exit 1", 151 | "}" 152 | ] 153 | } 154 | }, 155 | { 156 | "action": "aws:runPowerShellScript", 157 | "name": "RestoreIISAppPool", 158 | "precondition": { 159 | "StringEquals": [ 160 | "platformType", 161 | "Windows" 162 | ] 163 | }, 164 | "inputs": { 165 | "timeoutSeconds": 120, 166 | "onFailure": "successAndExit", 167 | "runCommand": [ 168 | "function Write-Log {", 169 | " param($Message)", 170 | " $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'", 171 | " Write-Output \"[$timestamp] $Message\"", 172 | "}", 173 | "", 174 | "try {", 175 | " # Import the WebAdministration module", 176 | " Import-Module WebAdministration", 177 | "", 178 | " # Restore the application pool", 179 | " Write-Log \"Restoring IIS Application Pool: {{IISAppPoolName}}\"", 180 | " Start-WebAppPool -Name {{IISAppPoolName}}", 181 | "", 182 | " # Verify the app pool is started", 183 | " $startedPool = Get-IISAppPool -Name {{IISAppPoolName}}", 184 | " if ($startedPool.State -ne 'Started') {", 185 | " throw \"Failed to start application pool\"", 186 | " }", 187 | " Write-Log \"Application Pool restored successfully\"", 188 | "}", 189 | "catch {", 190 | " Write-Log \"ERROR during restoration: $($_.Exception.Message)\"", 191 | " throw", 192 | "}", 193 | "finally {", 194 | " # Cleanup - always remove the experiment file", 195 | " try {", 196 | " Write-Log \"Cleaning up: Deleting JSON file C:\\temp\\fis_windows_iis_experiment.json\"", 197 | " Remove-Item -Path C:\\temp\\fis_windows_iis_experiment.json -Force", 198 | " Write-Log \"JSON file deleted successfully\"", 199 | " }", 200 | " catch {", 201 | " Write-Log \"ERROR during cleanup: $($_.Exception.Message)\"", 202 | " }", 203 | "}" 204 | ] 205 | } 206 | } 207 | ] 208 | } 209 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-automation.yaml: -------------------------------------------------------------------------------- 1 | description: "Block DynamoDB table access by modifying table resource policy" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tableName: 6 | type: String 7 | description: "DynamoDB table name to impair" 8 | default: "my-global-table" 9 | targetRegion: 10 | type: String 11 | description: "AWS Region to block DynamoDB access in" 12 | default: "us-east-1" 13 | duration: 14 | type: String 15 | description: "Duration of the impairment in ISO8601 format" 16 | default: "PT10M" 17 | AutomationAssumeRole: 18 | type: String 19 | description: "IAM role for the automation execution" 20 | default: "" 21 | 22 | mainSteps: 23 | - name: waitForFISAction 24 | action: aws:sleep 25 | inputs: 26 | Duration: PT10S 27 | 28 | - name: applyDenyPolicy 29 | action: aws:executeScript 30 | onFailure: step:cleanupPolicy 31 | onCancel: step:cleanupPolicy 32 | inputs: 33 | Runtime: python3.11 34 | Handler: apply_deny_policy 35 | Script: | 36 | import boto3 37 | import json 38 | import time 39 | 40 | def apply_deny_policy(events, context): 41 | table_name = events['tableName'] 42 | target_region = events['targetRegion'] 43 | 44 | # Create DynamoDB client for target 
region 45 | dynamodb = boto3.client('dynamodb', region_name=target_region) 46 | 47 | # Get current table description 48 | response = dynamodb.describe_table(TableName=table_name) 49 | table_arn = response['Table']['TableArn'] 50 | 51 | # Check for existing resource policy 52 | existing_policy = None 53 | try: 54 | policy_response = dynamodb.get_resource_policy(ResourceArn=table_arn) 55 | existing_policy = json.loads(policy_response['Policy']) 56 | print(f"Found existing policy on table {table_name}") 57 | except (dynamodb.exceptions.ResourceNotFoundException, dynamodb.exceptions.PolicyNotFoundException): 58 | print(f"No existing policy on table {table_name}") 59 | 60 | # Create deny statement 61 | deny_statement = { 62 | "Sid": "FISDenyAccess", 63 | "Effect": "Deny", 64 | "Principal": "*", 65 | "Action": [ 66 | "dynamodb:PutItem", 67 | "dynamodb:GetItem", 68 | "dynamodb:UpdateItem", 69 | "dynamodb:DeleteItem", 70 | "dynamodb:Query", 71 | "dynamodb:Scan", 72 | "dynamodb:BatchGetItem", 73 | "dynamodb:BatchWriteItem" 74 | ], 75 | "Resource": table_arn, 76 | "Condition": { 77 | "StringNotEquals": { 78 | "aws:PrincipalArn": [ 79 | "arn:aws:iam::*:role/*SSM*", 80 | "arn:aws:iam::*:role/*FIS*", 81 | "arn:aws:iam::*:role/aws-service-role/replication.dynamodb.amazonaws.com/AWSServiceRoleForDynamoDBReplication" 82 | ] 83 | } 84 | } 85 | } 86 | 87 | # Merge with existing policy or create new one 88 | if existing_policy: 89 | new_policy = existing_policy.copy() 90 | new_policy['Statement'].append(deny_statement) 91 | else: 92 | new_policy = { 93 | "Version": "2012-10-17", 94 | "Statement": [deny_statement] 95 | } 96 | 97 | 98 | try: 99 | # Apply the merged resource policy with retry logic 100 | max_retries = 5 101 | for attempt in range(max_retries): 102 | try: 103 | dynamodb.put_resource_policy( 104 | ResourceArn=table_arn, 105 | Policy=json.dumps(new_policy) 106 | ) 107 | break 108 | except dynamodb.exceptions.ResourceInUseException: 109 | if attempt < max_retries - 1: 110 | print(f"Table busy, retrying in {2**attempt} seconds...") 111 | time.sleep(2**attempt) 112 | else: 113 | raise 114 | 115 | print(f"Applied deny policy to table {table_name} in {target_region}") 116 | 117 | return { 118 | "statusCode": 200, 119 | "tableArn": table_arn, 120 | "targetRegion": target_region, 121 | "existingPolicy": existing_policy, 122 | "body": f"Successfully applied deny policy to {table_name}" 123 | } 124 | except Exception as e: 125 | print(f"Error applying policy: {str(e)}") 126 | raise e 127 | InputPayload: 128 | tableName: "{{ tableName }}" 129 | targetRegion: "{{ targetRegion }}" 130 | outputs: 131 | - Name: tableArn 132 | Selector: $.Payload.tableArn 133 | Type: String 134 | - Name: targetRegion 135 | Selector: $.Payload.targetRegion 136 | Type: String 137 | - Name: existingPolicy 138 | Selector: $.Payload.existingPolicy 139 | Type: StringMap 140 | 141 | - name: waitForDuration 142 | action: aws:sleep 143 | inputs: 144 | Duration: "{{ duration }}" 145 | 146 | - name: cleanupPolicy 147 | action: aws:executeScript 148 | inputs: 149 | Runtime: python3.11 150 | Handler: cleanup_policy 151 | Script: | 152 | import boto3 153 | import json 154 | 155 | def cleanup_policy(events, context): 156 | table_arn = events.get('tableArn') 157 | target_region = events.get('targetRegion') 158 | table_name = events.get('tableName') 159 | existing_policy = events.get('existingPolicy') 160 | 161 | # If we don't have tableArn from previous step, try to get it 162 | if not table_arn and table_name and target_region: 163 | dynamodb 
= boto3.client('dynamodb', region_name=target_region) 164 | response = dynamodb.describe_table(TableName=table_name) 165 | table_arn = response['Table']['TableArn'] 166 | 167 | if table_arn and target_region: 168 | dynamodb = boto3.client('dynamodb', region_name=target_region) 169 | try: 170 | # Get current policy to see what's there now 171 | current_policy_response = dynamodb.get_resource_policy(ResourceArn=table_arn) 172 | current_policy = json.loads(current_policy_response['Policy']) 173 | 174 | # Remove only our FISDenyAccess statement by SID 175 | filtered_statements = [ 176 | stmt for stmt in current_policy.get('Statement', []) 177 | if stmt.get('Sid') != 'FISDenyAccess' 178 | ] 179 | 180 | if filtered_statements: 181 | # Keep other statements, remove only ours 182 | cleaned_policy = { 183 | "Version": current_policy.get("Version", "2012-10-17"), 184 | "Statement": filtered_statements 185 | } 186 | dynamodb.put_resource_policy( 187 | ResourceArn=table_arn, 188 | Policy=json.dumps(cleaned_policy) 189 | ) 190 | print(f"Removed FISDenyAccess statement, preserved other policies on table {table_arn}") 191 | else: 192 | # No other statements, delete entire policy 193 | dynamodb.delete_resource_policy(ResourceArn=table_arn) 194 | print(f"Removed entire policy from table {table_arn}") 195 | 196 | return {"statusCode": 200, "body": "Successfully cleaned up policy"} 197 | except Exception as e: 198 | print(f"Error cleaning up policy: {str(e)}") 199 | # Don't fail if policy doesn't exist 200 | if "ResourceNotFoundException" in str(e) or "PolicyNotFoundException" in str(e): 201 | return {"statusCode": 200, "body": "Policy already removed"} 202 | raise e 203 | else: 204 | print("No table ARN provided for cleanup") 205 | return {"statusCode": 200, "body": "No cleanup needed"} 206 | InputPayload: 207 | tableArn: "{{ applyDenyPolicy.tableArn }}" 208 | targetRegion: "{{ applyDenyPolicy.targetRegion }}" 209 | tableName: "{{ tableName }}" 210 | existingPolicy: "{{ applyDenyPolicy.existingPolicy }}" 211 | --------------------------------------------------------------------------------
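These automation documents are typically registered as SSM documents and invoked by AWS FIS through the `aws:ssm:start-automation-execution` action wired into the corresponding experiment templates. For a quick standalone smoke test of the DynamoDB region impairment automation above, a sketch like the following could start it directly with boto3. This is an illustration only, not part of the repository: the registered document name, IAM role ARN, table name, and region are placeholder assumptions you would replace with your own values.

```python
# Minimal sketch (assumptions noted below): start the DynamoDB region
# impairment automation directly via SSM for a short dry run, then poll
# until it reaches a terminal state so the cleanupPolicy step has removed
# the FISDenyAccess statement again.
import time
import boto3

ssm = boto3.client("ssm", region_name="us-east-1")

execution_id = ssm.start_automation_execution(
    # Hypothetical name under which the automation YAML was registered.
    DocumentName="dynamodb-region-impairment-automation",
    Parameters={
        "tableName": ["my-global-table"],      # placeholder table name
        "targetRegion": ["us-east-1"],
        "duration": ["PT2M"],                  # keep the impairment short for a dry run
        # Placeholder ARN; use the role created from the SSM automation policy in this repo.
        "AutomationAssumeRole": ["arn:aws:iam::123456789012:role/my-ssm-automation-role"],
    },
)["AutomationExecutionId"]

terminal_states = {"Success", "Failed", "TimedOut", "Cancelled"}
while True:
    status = ssm.get_automation_execution(AutomationExecutionId=execution_id)[
        "AutomationExecution"
    ]["AutomationExecutionStatus"]
    print(f"{execution_id}: {status}")
    if status in terminal_states:
        break
    time.sleep(15)
```

Waiting for a terminal state matters here because the deny statement is only removed in the final cleanup step; interrupting the run early would leave the table policy in the impaired state until it is cleaned up manually.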