├── dynamodb-traffic-blackhole-region-impairment
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── dynamodb-traffic-blackhole-region-impairment-template.json
    ├── dynamodb-traffic-blackhole-region-impairment-iam-policy.json
    └── README.md
├── templates
    ├── AWSFIS.json
    ├── example-iam-trust-relationship.json
    ├── example-iam-policy.json
    └── README.md
├── ec2-windows-stop-iis
    ├── images
    │   └── ssm.png
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ec2-windows-stop-iis-template.json
    ├── ec2-windows-stop-iis-iam-policy.json
    ├── README.md
    └── ec2-windows-stop-iis-ssm-template.json
├── sqs-queue-impairment
    ├── images
    │   └── sqs.png
    ├── AWSFIS.json
    ├── ssm-iam-trust-relationship.json
    ├── fis-iam-trust-relationship.json
    ├── sqs-queue-impairment-tag-based-ssm-automation-role-iam-policy.json
    ├── sqs-queue-impairment-tag-based-fis-role-iam-policy.json
    ├── sqs-queue-impairment-tag-based-experiment-template.json
    ├── README.md
    └── sqs-queue-impairment-tag-based-automation.yaml
├── cloudfront-impairment
    ├── AWSFIS.json
    ├── images
    │   ├── experiment-workflow.png
    │   └── cloudfront-impairment-architecture.png
    ├── ssm-iam-trust-relationship.json
    ├── fis-iam-trust-relationship.json
    ├── cloudfront-impairment-tag-based-ssm-automation-role-iam-policy.json
    ├── cloudfront-impairment-tag-based-fis-role-iam-policy.json
    └── cloudfront-impairment-tag-based-experiment-template.json
├── ec2-spot-interruption
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ec2-spot-interruption-template.json
    ├── ec2-spot-interruption-iam-policy.json
    └── README.md
├── aurora-cluster-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── aurora-cluster-failover-template.json
    ├── aurora-cluster-failover-iam-policy.json
    └── README.md
├── dynamodb-region-impairment
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── dynamodb-region-impairment-ssm-automation-role-iam-policy.json
    ├── fis-role-policy.json
    ├── dynamodb-region-impairment-fis-role-iam-policy.json
    ├── dynamodb-region-impairment-experiment-template.json
    ├── README.md
    └── dynamodb-region-impairment-automation.yaml
├── ec2-instances-terminate
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ec2-instances-terminate-iam-policy.json
    ├── ec2-instances-terminate-template.json
    └── README.md
├── sap-ec2-instance-stop-ascs
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── sap-ec2-instance-stop-sap-ascs-template.json
    ├── sap-ec2-instance-stop-sap-policy.json
    └── README.md
├── sap-ebs-pause-database-data
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── sap-ebs-pause-database-data-policy.json
    ├── sap-ebs-pause-database-data-template.json
    └── README.md
├── sap-ec2-instance-stop-database
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── sap-ec2-instance-stop-sap-database-template.json
    ├── sap-ec2-instance-stop-sap-database-policy.json
    └── README.md
├── aurora-global-region-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── aurora-global-region-failover-ssm-automation-role-iam-policy.json
    ├── aurora-global-region-failover-fis-role-iam-policy.json
    ├── aurora-global-region-failover-experiment-template.json
    └── README.md
├── mysql-rds-loadtest-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── mysql-rds-loadtest-failover-iam-policy.json
    ├── mysql-rds-loadtest-failover-template.json
    └── README.md
├── aurora-postgres-cluster-loadtest-failover
    ├── AWSFIS.json
    ├── fis-iam-trust-relationship.json
    ├── aurora-postgres-cluster-loadtest-failover-iam-policy.json
    ├── aurora-postgres-cluster-loadtest-failover-template.json
    ├── README.md
    └── aurora-postgres-cluster-loadtest-failover-ssm-template.json
├── .gitignore
├── elasticache-redis-connection-failure
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── redis-connection-failure-ssm-role-iam-policy.json
    ├── redis-connection-failure-experiment-template.json
    ├── redis-connection-failure-fis-role-iam-policy.json
    ├── README.md
    └── redis-connection-failure-automation.yaml
├── elasticache-redis-primary-node-failover
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── elasticache-redis-primary-node-failover-ssm-role-iam-policy.json
    ├── elasticache-redis-primary-node-failover-experiment-template.json
    ├── elasticache-redis-primary-node-failover-fis-role-iam-policy.json
    ├── elasticache-redis-primary-node-failover-automation.json
    └── README.md
├── elasticache-redis-primary-node-reboot
    ├── fis-iam-trust-relationship.json
    ├── ssm-iam-trust-relationship.json
    ├── elasticache-node-primary-node-reboot-ssm-role-iam-policy.json
    ├── elasticache-redis-primary-node-reboot-experiment-template.json
    ├── elasticache-redis-primary-node-reboot-fis-role-iam-policy.json
    ├── README.md
    └── elasticache-redis-primary-node-reboot-automation.yaml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
└── CONTRIBUTING.md
/dynamodb-traffic-blackhole-region-impairment/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "templateVersion": "2020-09-01"
3 | }
4 |
--------------------------------------------------------------------------------
/templates/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/ec2-windows-stop-iis/images/ssm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/ec2-windows-stop-iis/images/ssm.png
--------------------------------------------------------------------------------
/sqs-queue-impairment/images/sqs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/sqs-queue-impairment/images/sqs.png
--------------------------------------------------------------------------------
/cloudfront-impairment/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/ec2-spot-interruption/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/ec2-windows-stop-iis/AWSFIS.json:
--------------------------------------------------------------------------------
1 | {
2 | "AWSFIS": {
3 | "template": {
4 | "version": "1.0"
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------
/aurora-cluster-failover/AWSFIS.json:
--------------------------------------------------------------------------------
1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version":
"1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /dynamodb-region-impairment/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /ec2-instances-terminate/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /sqs-queue-impairment/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/AWSFIS.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSFIS": { 3 | "template": { 4 | "version": "1.0" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /cloudfront-impairment/images/experiment-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/cloudfront-impairment/images/experiment-workflow.png -------------------------------------------------------------------------------- /cloudfront-impairment/images/cloudfront-impairment-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/fis-template-library/HEAD/cloudfront-impairment/images/cloudfront-impairment-architecture.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.js 2 | !jest.config.js 3 | *.d.ts 4 | node_modules 5 | .aws-sam 6 
| .vscode 7 | samconfig.toml 8 | .DS_Store 9 | 10 | # CDK asset staging directory 11 | .cdk.staging 12 | cdk.out 13 | .idea/ 14 | -------------------------------------------------------------------------------- /cloudfront-impairment/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sqs-queue-impairment/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 
10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /templates/example-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /aurora-cluster-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /ec2-instances-terminate/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /ec2-spot-interruption/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /ec2-windows-stop-iis/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /cloudfront-impairment/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /sqs-queue-impairment/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/fis-iam-trust-relationship.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/fis-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "fis.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aurora-global-region-failover/ssm-iam-trust-relationship.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ssm.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /aurora-global-region-failover/aurora-global-region-failover-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "rds:DescribeGlobalClusters", 8 | "rds:FailoverGlobalCluster" 9 | ], 10 | "Resource": "*" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-ssm-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "elasticache:DescribeReplicationGroups", 8 | "elasticache:ListTagsForResource", 9 | "elasticache:ModifyReplicationGroup" 10 | ], 11 | "Resource": "*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "sts:GetCallerIdentity" 17 | ], 18 | "Resource": "*" 19 | } 20 | ] 21 | } 22 | 
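Note: the trust-relationship and role-policy JSON files above come in pairs — a trust policy that lets ssm.amazonaws.com (or fis.amazonaws.com) assume a role, and a permissions policy scoped to the experiment. A minimal boto3 sketch of attaching such a pair to a role is shown below; the role and inline-policy names are illustrative assumptions, not values defined by these templates.

import boto3  # assumes AWS credentials and a default region are configured

iam = boto3.client("iam")

ROLE_NAME = "ElastiCache-Failover-SSM-Role"  # hypothetical name; align with your experiment template

# Create the role that SSM Automation will assume, using the trust relationship file above.
with open("ssm-iam-trust-relationship.json") as f:
    role = iam.create_role(RoleName=ROLE_NAME, AssumeRolePolicyDocument=f.read())

# Attach the experiment's permissions policy as an inline policy on that role.
with open("elasticache-redis-primary-node-failover-ssm-role-iam-policy.json") as f:
    iam.put_role_policy(
        RoleName=ROLE_NAME,
        PolicyName="elasticache-redis-primary-node-failover-ssm-role-iam-policy",
        PolicyDocument=f.read(),
    )

# Reference this ARN as AutomationAssumeRole in the experiment template's documentParameters.
print(role["Role"]["Arn"])
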
-------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-node-primary-node-reboot-ssm-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "elasticache:DescribeCacheClusters", 8 | "elasticache:DescribeReplicationGroups", 9 | "elasticache:ListTagsForResource", 10 | "elasticache:RebootCacheCluster" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "sts:GetCallerIdentity" 18 | ], 19 | "Resource": "*" 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "sqs:ListQueues", 8 | "sqs:ListQueueTags", 9 | "sqs:AddPermission", 10 | "sqs:RemovePermission", 11 | "sqs:GetQueueAttributes", 12 | "sqs:SetQueueAttributes" 13 | ], 14 | "Resource": "arn:aws:sqs:::*", 15 | "Condition": { 16 | "StringEquals": { 17 | "aws:ResourceTag/FIS-Ready": "True" 18 | } 19 | } 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /cloudfront-impairment/cloudfront-impairment-tag-based-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "cloudfront:ListDistributions", 8 | "cloudfront:ListTagsForResource", 9 | "cloudfront:GetDistributionConfig", 10 | "cloudfront:CreateInvalidation" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "s3:GetBucketPolicy", 18 | "s3:PutBucketPolicy", 19 | "s3:DeleteBucketPolicy" 20 | ], 21 | "Resource": "arn:aws:s3:::*" 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-ssm-automation-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "dynamodb:PutResourcePolicy", 8 | "dynamodb:DeleteResourcePolicy", 9 | "dynamodb:GetResourcePolicy", 10 | "dynamodb:DescribeTable" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "sts:GetCallerIdentity" 18 | ], 19 | "Resource": "*" 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-ssm-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "elasticache:DescribeCacheClusters", 8 | "elasticache:ListTagsForResource" 9 | ], 10 | "Resource": "*" 11 | }, 12 | { 13 | "Effect": "Allow", 14 | "Action": [ 15 | "ec2:DescribeSecurityGroups", 16 | "ec2:AuthorizeSecurityGroupIngress", 17 | "ec2:RevokeSecurityGroupIngress" 18 | ], 19 | "Resource": "*" 20 | }, 21 | { 22 | "Effect": "Allow", 23 | "Action": [ 24 | "sts:GetCallerIdentity" 25 | ], 26 | "Resource": "*" 27 | } 28 
| ] 29 | } 30 | -------------------------------------------------------------------------------- /templates/example-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents", 10 | "logs:DescribeLogGroups", 11 | "logs:DescribeLogStreams" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Effect": "Allow", 17 | "Action": [ 18 | "fis:StartExperiment", 19 | "fis:GetExperimentSummary", 20 | "fis:GetExperimentResults", 21 | "fis:StopExperiment" 22 | ], 23 | "Resource": "*" 24 | } 25 | ] 26 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/aurora-global-region-failover-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ssm:StartAutomationExecution", 8 | "ssm:GetAutomationExecution", 9 | "ssm:DescribeAutomationExecutions", 10 | "ssm:DescribeAutomationStepExecutions", 11 | "ssm:StopAutomationExecution" 12 | ], 13 | "Resource": [ 14 | "arn:aws:ssm:*:*:document/*", 15 | "arn:aws:ssm:*:*:automation-execution/*" 16 | ] 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "iam:PassRole" 22 | ], 23 | "Resource": "arn:aws:iam::*:role/*SSM*" 24 | } 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/fis-role-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "dynamodb:PauseReplication", 8 | "dynamodb:ResumeReplication", 9 | "dynamodb:DescribeTable", 10 | "dynamodb:ListTagsOfResource" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "ssm:StartAutomationExecution", 18 | "ssm:GetAutomationExecution", 19 | "ssm:StopAutomationExecution" 20 | ], 21 | "Resource": "*" 22 | }, 23 | { 24 | "Effect": "Allow", 25 | "Action": [ 26 | "iam:PassRole" 27 | ], 28 | "Resource": "arn:aws:iam::*:role/*SSM*" 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
17 | -------------------------------------------------------------------------------- /ec2-spot-interruption/ec2-spot-interruption-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "interrupt-ec2-spot", 3 | "targets": { 4 | "spot-instances": { 5 | "resourceType": "aws:ec2:spot-instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | } 11 | }, 12 | "actions": { 13 | "interrupt-ec2-spot": { 14 | "actionId": "aws:ec2:send-spot-instance-interruptions", 15 | "parameters": { 16 | "durationBeforeInterruption": "PT2M" 17 | }, 18 | "targets": { 19 | "SpotInstances": "spot-instances" 20 | } 21 | } 22 | }, 23 | "stopConditions": [ 24 | { 25 | "source": "none" 26 | } 27 | ], 28 | "roleArn": "arn:aws:iam:::role/", 29 | "tags": { 30 | "Name": "interrupt-ec2-spot" 31 | }, 32 | "experimentOptions": { 33 | "accountTargeting": "single-account", 34 | "emptyTargetResolutionMode": "fail" 35 | } 36 | } -------------------------------------------------------------------------------- /ec2-instances-terminate/ec2-instances-terminate-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | "fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": "ec2:TerminateInstances", 17 | "Resource": "arn:aws:ec2:*:*:instance/*", 18 | "Condition": { 19 | "StringEquals": { 20 | "aws:ResourceTag/FIS-Ready": "True" 21 | } 22 | } 23 | }, 24 | { 25 | "Effect": "Allow", 26 | "Action": [ 27 | "logs:CreateLogGroup", 28 | "logs:CreateLogStream", 29 | "logs:PutLogEvents", 30 | "logs:DescribeLogGroups", 31 | "logs:DescribeLogStreams" 32 | ], 33 | "Resource": "*" 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /aurora-global-region-failover/aurora-global-region-failover-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Aurora Global Database regional failover experiment using SSM automation to test disaster recovery procedures and measure RTO/RPO", 3 | "targets": {}, 4 | "actions": { 5 | "aurora-global-failover": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "parameters": { 8 | "documentArn": "arn:aws:ssm:::document/aurora-global-region-failover-automation", 9 | "documentParameters": "{\"globalClusterIdentifier\": \"\", \"failoverType\": \"switchover\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}", 10 | "maxDuration": "PT10M" 11 | } 12 | } 13 | }, 14 | "stopConditions": [ 15 | { 16 | "source": "none" 17 | } 18 | ], 19 | "roleArn": "arn:aws:iam:::role/", 20 | "tags": { 21 | "Name": "aurora-global-region-failover" 22 | }, 23 | "experimentOptions": { 24 | "accountTargeting": "single-account", 25 | "emptyTargetResolutionMode": "fail" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "dynamodb:PauseReplication", 8 | "dynamodb:ResumeReplication", 9 | "dynamodb:PutResourcePolicy", 10 | 
"dynamodb:DeleteResourcePolicy", 11 | "dynamodb:GetResourcePolicy", 12 | "dynamodb:DescribeTable", 13 | "dynamodb:ListTagsOfResource", 14 | "dynamodb:ListTables", 15 | "tag:GetResources" 16 | ], 17 | "Resource": "*" 18 | }, 19 | { 20 | "Effect": "Allow", 21 | "Action": [ 22 | "ssm:StartAutomationExecution", 23 | "ssm:GetAutomationExecution", 24 | "ssm:StopAutomationExecution" 25 | ], 26 | "Resource": "*" 27 | }, 28 | { 29 | "Effect": "Allow", 30 | "Action": [ 31 | "iam:PassRole" 32 | ], 33 | "Resource": "arn:aws:iam::*:role/*SSM*" 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /ec2-instances-terminate/ec2-instances-terminate-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "ec2-instance-terminate", 3 | "targets": { 4 | "Instances-Target-1": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "PERCENT(25)" 10 | } 11 | }, 12 | "actions": { 13 | "ec2-instances-terminate": { 14 | "actionId": "aws:ec2:terminate-instances", 15 | "parameters": {}, 16 | "targets": { 17 | "Instances": "Instances-Target-1" 18 | } 19 | } 20 | }, 21 | "stopConditions": [ 22 | { 23 | "source": "none" 24 | } 25 | ], 26 | "roleArn": "arn:aws:iam:::role/", 27 | "tags": {}, 28 | "experimentOptions": { 29 | "accountTargeting": "single-account", 30 | "emptyTargetResolutionMode": "fail" 31 | } 32 | } -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate Redis connection failure to test client circuit breaker behavior", 3 | "targets": {}, 4 | "actions": { 5 | "disableRedisConnections": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Disable Redis connections for 5 minutes to test resilience", 8 | "parameters": { 9 | "maxDuration": "PT30M", 10 | "documentArn": "arn:aws:ssm:::document/", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT5M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/", 22 | "tags": { 23 | "Name": "RedisConnectionFailureTest", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /cloudfront-impairment/cloudfront-impairment-tag-based-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:::document/", 25 | "arn:aws:ssm:::automation-definition/:*", 26 | 
"arn:aws:ssm:::automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowPassRole", 31 | "Effect": "Allow", 32 | "Action": ["iam:PassRole"], 33 | "Resource": ["arn:aws:iam:::role/"] 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:::document/", 25 | "arn:aws:ssm:::automation-definition/S:*", 26 | "arn:aws:ssm:::automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowPassRole", 31 | "Effect": "Allow", 32 | "Action": ["iam:PassRole"], 33 | "Resource": ["arn:aws:iam:::role/"] 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /cloudfront-impairment/cloudfront-impairment-tag-based-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate CloudFront distribution impairment with specific tag by disabling for 10 minutes", 3 | "targets": {}, 4 | "actions": { 5 | "impairCloudFront": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Simulate CloudFront distribution impairment by disabling for 10 minutes", 8 | "parameters": { 9 | "maxDuration": "PT1H", 10 | "documentArn": "arn:aws:ssm:::document/", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT10M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/", 22 | "tags": { 23 | "Name": "SimulateCloudFrontImpairment", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:::document/", 25 | "arn:aws:ssm:::automation-definition/:*", 26 | "arn:aws:ssm:::automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowPassRole", 31 | "Effect": "Allow", 32 | "Action": ["iam:PassRole"], 33 | "Resource": ["arn:aws:iam:::role/"] 34 | } 35 | ] 
36 | } 37 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-redis-primary-node-reboot-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate ElastiCache Redis primary node reboot to test application resilience", 3 | "targets": {}, 4 | "actions": { 5 | "triggerPrimaryNodeReboot": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Reboot Redis primary node and monitor recovery", 8 | "parameters": { 9 | "maxDuration": "PT30M", 10 | "documentArn": "arn:aws:ssm:us-east-1::document/ElastiCache-Redis-Primary-Node-Reboot", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"region\": \"us-east-1\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/ElastiCache-SSM-Automation-Role\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/ElastiCache-FIS-Role", 22 | "tags": { 23 | "Name": "ElastiCacheRedisPrimaryNodeRebootTest", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /aurora-cluster-failover/aurora-cluster-failover-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "aurora-cluster-failover", 3 | "targets": { 4 | "Clusters-Target-1": { 5 | "resourceType": "aws:rds:cluster", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL", 10 | "parameters": {} 11 | } 12 | }, 13 | "actions": { 14 | "failover-aurora-cluster": { 15 | "actionId": "aws:rds:failover-db-cluster", 16 | "parameters": {}, 17 | "targets": { 18 | "Clusters": "Clusters-Target-1" 19 | } 20 | } 21 | }, 22 | "stopConditions": [ 23 | { 24 | "source": "none" 25 | } 26 | ], 27 | "roleArn": "arn:aws:iam:::role/", 28 | "tags": {}, 29 | "experimentOptions": { 30 | "accountTargeting": "single-account", 31 | "emptyTargetResolutionMode": "fail" 32 | } 33 | } -------------------------------------------------------------------------------- /ec2-spot-interruption/ec2-spot-interruption-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ec2:SendSpotInstanceInterruptions", 8 | "ec2:DescribeSpotInstanceRequests", 9 | "ec2:DescribeInstances" 10 | ], 11 | "Resource": "*", 12 | "Condition": { 13 | "StringEquals": { 14 | "aws:ResourceTag/FIS-Ready": "True" 15 | } 16 | } 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "logs:CreateLogGroup", 22 | "logs:CreateLogStream", 23 | "logs:PutLogEvents", 24 | "logs:DescribeLogGroups", 25 | "logs:DescribeLogStreams" 26 | ], 27 | "Resource": "*" 28 | }, 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "fis:StartExperiment", 33 | "fis:GetExperimentSummary", 34 | "fis:GetExperimentResults", 35 | "fis:StopExperiment" 36 | ], 37 | "Resource": "*" 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /aurora-cluster-failover/aurora-cluster-failover-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | 
{ 5 | "Effect": "Allow", 6 | "Action": [ 7 | "rds:FailoverDBCluster", 8 | "rds:DescribeDBClusters" 9 | ], 10 | "Resource": [ 11 | "arn:aws:rds:*:*:cluster:*" 12 | ], 13 | "Condition": { 14 | "StringEquals": { 15 | "aws:ResourceTag/FIS-Ready": "True" 16 | } 17 | } 18 | }, 19 | { 20 | "Effect": "Allow", 21 | "Action": [ 22 | "logs:CreateLogGroup", 23 | "logs:CreateLogStream", 24 | "logs:PutLogEvents", 25 | "logs:DescribeLogGroups", 26 | "logs:DescribeLogStreams" 27 | ], 28 | "Resource": "*" 29 | }, 30 | { 31 | "Effect": "Allow", 32 | "Action": [ 33 | "fis:StartExperiment", 34 | "fis:GetExperimentSummary", 35 | "fis:GetExperimentResults", 36 | "fis:StopExperiment" 37 | ], 38 | "Resource": "*" 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /ec2-windows-stop-iis/ec2-windows-stop-iis-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "This Experiment Stops IIS on target Windows Instances", 3 | "targets": { 4 | "IISServers": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | } 11 | }, 12 | "actions": { 13 | "StopIIS": { 14 | "actionId": "aws:ssm:send-command", 15 | "parameters": { 16 | "documentArn": "arn:aws:ssm:::document/StopIISAppPool", 17 | "documentParameters": "{\"DurationSeconds\": \"285\", \"IISAppPoolName\": \"DefaultAppPool\"}", 18 | "duration": "PT5M" 19 | }, 20 | "targets": { 21 | "Instances": "IISServers" 22 | } 23 | } 24 | }, 25 | "stopConditions": [ 26 | { 27 | "source": "none" 28 | } 29 | ], 30 | "roleArn": "arn:aws:iam:::role/", 31 | "tags": { 32 | "Name": "StopIISExperiment" 33 | }, 34 | "experimentOptions": { 35 | "accountTargeting": "single-account", 36 | "emptyTargetResolutionMode": "fail" 37 | } 38 | } -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Force ElastiCache Redis primary node failover to test automatic failover mechanisms", 3 | "targets": {}, 4 | "actions": { 5 | "triggerPrimaryFailover": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Force Redis primary node failover and monitor completion", 8 | "parameters": { 9 | "maxDuration": "PT30M", 10 | "documentArn": "arn:aws:ssm:{{ aws:region }}::document/ElastiCache-Redis-Primary-Node-Failover", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"region\": \"{{ aws:region }}\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/ElastiCache-Failover-SSM-Role\"}" 12 | }, 13 | "targets": {} 14 | } 15 | }, 16 | "stopConditions": [ 17 | { 18 | "source": "none" 19 | } 20 | ], 21 | "roleArn": "arn:aws:iam:::role/ElastiCache-Failover-FIS-Role", 22 | "tags": { 23 | "Name": "ElastiCacheRedisPrimaryNodeFailoverTest", 24 | "Purpose": "resilience-testing" 25 | }, 26 | "experimentOptions": { 27 | "accountTargeting": "single-account", 28 | "emptyTargetResolutionMode": "skip" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ec2-windows-stop-iis/ec2-windows-stop-iis-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | 
"fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect":"Allow", 16 | "Action":[ 17 | "ssm:SendCommand" 18 | ], 19 | "Resource":[ 20 | "arn:aws:ssm:*:*:document/*" 21 | ] 22 | }, 23 | { 24 | "Effect": "Allow", 25 | "Action": [ 26 | "ssm:SendCommand" 27 | ], 28 | "Resource": [ 29 | "arn:aws:ec2:*:*:instance/*" 30 | ], 31 | "Condition": { 32 | "StringEquals": { 33 | "aws:ResourceTag/FIS-Ready": "True" 34 | } 35 | } 36 | }, 37 | { 38 | "Effect": "Allow", 39 | "Action": [ 40 | "logs:CreateLogGroup", 41 | "logs:CreateLogStream", 42 | "logs:PutLogEvents", 43 | "logs:DescribeLogGroups", 44 | "logs:DescribeLogStreams" 45 | ], 46 | "Resource": "*" 47 | } 48 | ] 49 | } -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/sap-ebs-pause-database-data-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ec2:DescribeVolumes" 8 | ], 9 | "Resource": "*" 10 | }, 11 | { 12 | "Effect": "Allow", 13 | "Action": [ 14 | "ec2:PauseVolumeIO" 15 | ], 16 | "Resource": "*" 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "logs:CreateLogGroup", 22 | "logs:CreateLogStream", 23 | "logs:PutLogEvents", 24 | "logs:DescribeLogGroups", 25 | "logs:DescribeLogStreams" 26 | ], 27 | "Resource": "*" 28 | }, 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "fis:StartExperiment", 33 | "fis:GetExperimentSummary", 34 | "fis:GetExperimentResults", 35 | "fis:StopExperiment" 36 | ], 37 | "Resource": "*" 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/dynamodb-traffic-blackhole-region-impairment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Block DynamoDB traffic at the network level and pause global table replication to simulate complete regional DynamoDB failure", 3 | "targets": { 4 | "subnets": { 5 | "resourceType": "aws:ec2:subnet", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | }, 11 | "dynamodbTables": { 12 | "resourceType": "aws:dynamodb:global-table", 13 | "resourceTags": { 14 | "FIS-Ready": "True" 15 | }, 16 | "selectionMode": "ALL" 17 | } 18 | }, 19 | "actions": { 20 | "blockDynamoDBTraffic": { 21 | "actionId": "aws:network:disrupt-connectivity", 22 | "description": "Block DynamoDB traffic from target subnets", 23 | "parameters": { 24 | "scope": "dynamodb", 25 | "duration": "PT10M" 26 | }, 27 | "targets": { 28 | "Subnets": "subnets" 29 | } 30 | }, 31 | "pauseReplication": { 32 | "actionId": "aws:dynamodb:global-table-pause-replication", 33 | "description": "Pause DynamoDB global table replication", 34 | "parameters": { 35 | "duration": "PT10M" 36 | }, 37 | "targets": { 38 | "Tables": "dynamodbTables" 39 | } 40 | } 41 | }, 42 | "stopConditions": [ 43 | { 44 | "source": "none" 45 | } 46 | ], 47 | "roleArn": "arn:aws:iam:::role/FIS-DynamoDB-Traffic-Blackhole-Role", 48 | "experimentOptions": { 49 | "emptyTargetResolutionMode": "skip" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/dynamodb-traffic-blackhole-region-impairment-iam-policy.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ec2:CreateNetworkAcl", 8 | "ec2:CreateNetworkAclEntry", 9 | "ec2:CreateTags", 10 | "ec2:DeleteNetworkAcl", 11 | "ec2:DescribeNetworkAcls", 12 | "ec2:DescribeSubnets", 13 | "ec2:DescribeVpcs", 14 | "ec2:ReplaceNetworkAclAssociation" 15 | ], 16 | "Resource": "*" 17 | }, 18 | { 19 | "Effect": "Allow", 20 | "Action": [ 21 | "ec2:CreateNetworkAclEntry" 22 | ], 23 | "Resource": "*", 24 | "Condition": { 25 | "StringEquals": { 26 | "ec2:ResourceTag/managedByFIS": "true" 27 | } 28 | } 29 | }, 30 | { 31 | "Effect": "Allow", 32 | "Action": [ 33 | "ec2:DeleteNetworkAcl" 34 | ], 35 | "Resource": "*", 36 | "Condition": { 37 | "StringEquals": { 38 | "ec2:ResourceTag/managedByFIS": "true" 39 | } 40 | } 41 | }, 42 | { 43 | "Effect": "Allow", 44 | "Action": [ 45 | "dynamodb:PauseReplication", 46 | "dynamodb:ResumeReplication", 47 | "dynamodb:PutResourcePolicy", 48 | "dynamodb:DeleteResourcePolicy", 49 | "dynamodb:GetResourcePolicy", 50 | "dynamodb:DescribeTable", 51 | "dynamodb:ListTagsOfResource", 52 | "dynamodb:ListTables", 53 | "tag:GetResources" 54 | ], 55 | "Resource": "*" 56 | } 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Block DynamoDB access in a specific region to test global table failover and application resilience", 3 | "targets": { 4 | "dynamodbTables": { 5 | "resourceType": "aws:dynamodb:global-table", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | } 11 | }, 12 | "actions": { 13 | "pauseReplication": { 14 | "actionId": "aws:dynamodb:global-table-pause-replication", 15 | "description": "Pause DynamoDB global table replication in ", 16 | "parameters": { 17 | "duration": "PT12M" 18 | }, 19 | "targets": { 20 | "Tables": "dynamodbTables" 21 | } 22 | }, 23 | "blockDynamoDBAccess": { 24 | "actionId": "aws:ssm:start-automation-execution", 25 | "description": "Block DynamoDB access in target region for 10 minutes", 26 | "parameters": { 27 | "maxDuration": "PT15M", 28 | "documentArn": "arn:aws:ssm:::document/DynamoDB-Region-Impairment", 29 | "documentParameters": "{\"tableName\": \"\", \"targetRegion\": \"\", \"duration\": \"PT10M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/SSM-DynamoDB-Automation-Role\"}" 30 | }, 31 | "targets": {} 32 | } 33 | }, 34 | "stopConditions": [ 35 | { 36 | "source": "none" 37 | } 38 | ], 39 | "roleArn": "arn:aws:iam:::role/FIS-DynamoDB-Role", 40 | "experimentOptions": { 41 | "emptyTargetResolutionMode": "skip" 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-redis-primary-node-reboot-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | 
"Resource": [ 24 | "arn:aws:ssm:*:*:document/ElastiCache-Redis-Primary-Node-Reboot", 25 | "arn:aws:ssm:*:*:automation-definition/ElastiCache-Redis-Primary-Node-Reboot:*", 26 | "arn:aws:ssm:*:*:automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowElastiCacheOperations", 31 | "Effect": "Allow", 32 | "Action": [ 33 | "elasticache:DescribeReplicationGroups", 34 | "elasticache:ListTagsForResource", 35 | "elasticache:RebootCacheCluster" 36 | ], 37 | "Resource": "*" 38 | }, 39 | { 40 | "Sid": "AllowSTSOperations", 41 | "Effect": "Allow", 42 | "Action": [ 43 | "sts:GetCallerIdentity" 44 | ], 45 | "Resource": "*" 46 | }, 47 | { 48 | "Sid": "AllowPassRole", 49 | "Effect": "Allow", 50 | "Action": ["iam:PassRole"], 51 | "Resource": ["arn:aws:iam::*:role/*"] 52 | } 53 | ] 54 | } 55 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-fis-role-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowFISExperimentLogging", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogDelivery", 9 | "logs:PutResourcePolicy", 10 | "logs:DescribeResourcePolicies", 11 | "logs:DescribeLogGroups" 12 | ], 13 | "Resource": "*" 14 | }, 15 | { 16 | "Sid": "AllowSSMDocumentExecution", 17 | "Effect": "Allow", 18 | "Action": [ 19 | "ssm:StartAutomationExecution", 20 | "ssm:GetAutomationExecution", 21 | "ssm:StopAutomationExecution" 22 | ], 23 | "Resource": [ 24 | "arn:aws:ssm:*:*:document/ElastiCache-Redis-Primary-Node-Failover", 25 | "arn:aws:ssm:*:*:automation-definition/ElastiCache-Redis-Primary-Node-Failover:*", 26 | "arn:aws:ssm:*:*:automation-execution/*" 27 | ] 28 | }, 29 | { 30 | "Sid": "AllowElastiCacheOperations", 31 | "Effect": "Allow", 32 | "Action": [ 33 | "elasticache:DescribeReplicationGroups", 34 | "elasticache:ListTagsForResource", 35 | "elasticache:ModifyReplicationGroup" 36 | ], 37 | "Resource": "*" 38 | }, 39 | { 40 | "Sid": "AllowSTSOperations", 41 | "Effect": "Allow", 42 | "Action": [ 43 | "sts:GetCallerIdentity" 44 | ], 45 | "Resource": "*" 46 | }, 47 | { 48 | "Sid": "AllowPassRole", 49 | "Effect": "Allow", 50 | "Action": ["iam:PassRole"], 51 | "Resource": ["arn:aws:iam::*:role/*"] 52 | } 53 | ] 54 | } 55 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/sap-ec2-instance-stop-sap-ascs-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "EC2 Stop - SAP ASCS ERS Cluster", 3 | "targets": { 4 | "SAPASCSERSCLUSTER": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Application": "SAP", 8 | "FIS-Ready": "True", 9 | "FIS-SAP-App-Tier": "Application", 10 | "FIS-SAP-Environment-Type": "Dev", 11 | "FIS-SAP-HA-Node": "Primary", 12 | "FIS-SAP-SID": "S4" 13 | }, 14 | "selectionMode": "ALL" 15 | } 16 | }, 17 | "actions": { 18 | "EC2STOP": { 19 | "actionId": "aws:ec2:stop-instances", 20 | "description": "Stop SAP ASCS Node", 21 | "parameters": { 22 | "startInstancesAfterDuration": "PT5M" 23 | }, 24 | "targets": { 25 | "Instances": "SAPASCSERSCLUSTER" 26 | } 27 | } 28 | }, 29 | "stopConditions": [ 30 | { 31 | "source": "none" 32 | } 33 | ], 34 | "roleArn": "arn:aws:iam:::role/", 35 | "tags": { 36 | "Name": "EC2 Stop - SAP ASCS ERS Cluster" 37 | }, 38 | "experimentOptions": { 39 | "accountTargeting": "single-account", 40 | 
"emptyTargetResolutionMode": "fail" 41 | } 42 | } -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/sap-ec2-instance-stop-sap-database-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "EC2 Stop - SAP Database Cluster", 3 | "targets": { 4 | "SAPDBCLUSTERNODE": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Application": "SAP", 8 | "FIS-Ready": "True", 9 | "FIS-SAP-App-Tier": "Database", 10 | "FIS-SAP-Environment-Type": "Dev", 11 | "FIS-SAP-HA-Node": "Primary", 12 | "FIS-SAP-SID": "S4" 13 | }, 14 | "selectionMode": "ALL" 15 | } 16 | }, 17 | "actions": { 18 | "EC2STOP": { 19 | "actionId": "aws:ec2:stop-instances", 20 | "description": "EC2 Stop - SAP Database Cluster", 21 | "parameters": { 22 | "startInstancesAfterDuration": "PT5M" 23 | }, 24 | "targets": { 25 | "Instances": "SAPDBCLUSTERNODE" 26 | } 27 | } 28 | }, 29 | "stopConditions": [ 30 | { 31 | "source": "none" 32 | } 33 | ], 34 | "roleArn": "arn:aws:iam:::role/", 35 | "tags": { 36 | "Name": "EC2 Stop - SAP Database Cluster" 37 | }, 38 | "experimentOptions": { 39 | "accountTargeting": "single-account", 40 | "emptyTargetResolutionMode": "fail" 41 | } 42 | } -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/sap-ebs-pause-database-data-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "EBS Pause - SAP Database Cluster", 3 | "targets": { 4 | "EBSPAUSEDB": { 5 | "resourceType": "aws:ec2:ebs-volume", 6 | "resourceTags": { 7 | "FIS-Application": "SAP", 8 | "FIS-Ready": "True", 9 | "FIS-SAP-App-Tier": "Database", 10 | "FIS-SAP-Database-Type": "Data", 11 | "FIS-SAP-Environment-Type": "Dev", 12 | "FIS-SAP-SID": "S4" 13 | }, 14 | "selectionMode": "ALL", 15 | "parameters": { 16 | "availabilityZoneIdentifier": "us-east-1a" 17 | } 18 | } 19 | }, 20 | "actions": { 21 | "EBSPAUSEDB": { 22 | "actionId": "aws:ebs:pause-volume-io", 23 | "description": "EBS Pause - SAP DB", 24 | "parameters": { 25 | "duration": "PT5M" 26 | }, 27 | "targets": { 28 | "Volumes": "EBSPAUSEDB" 29 | } 30 | } 31 | }, 32 | "stopConditions": [ 33 | { 34 | "source": "none" 35 | } 36 | ], 37 | "roleArn": "arn:aws:iam:::role/", 38 | "tags": { 39 | "Name": "EBS Pause - SAP Database Cluster" 40 | }, 41 | "experimentOptions": { 42 | "accountTargeting": "single-account", 43 | "emptyTargetResolutionMode": "fail" 44 | } 45 | } -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/aurora-postgres-cluster-loadtest-failover-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:InjectApiInternalError", 8 | "fis:InjectApiThrottleError", 9 | "fis:InjectApiUnavailableError" 10 | ], 11 | "Resource": "*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "rds:FailoverDBCluster", 17 | "rds:DescribeDBClusters", 18 | "rds:DescribeDBInstances" 19 | ], 20 | "Resource": "*", 21 | "Condition": { 22 | "StringEquals": { 23 | "aws:ResourceTag/FIS-Ready": "True" 24 | } 25 | } 26 | }, 27 | { 28 | "Effect": "Allow", 29 | "Action": [ 30 | "ssm:SendCommand", 31 | "ssm:ListCommandInvocations", 32 | "ssm:DescribeInstanceInformation", 33 | "ssm:GetCommandInvocation", 34 | 
"ssm:DescribeDocumentParameters" 35 | ], 36 | "Resource": [ 37 | "arn:aws:ssm:*:*:document/aurora-cluster-loadtest-document", 38 | "arn:aws:ssm:*:*:document/AWS-RunShellScript" 39 | ] 40 | }, 41 | { 42 | "Effect": "Allow", 43 | "Action": [ 44 | "ssm:SendCommand" 45 | ], 46 | "Resource": "arn:aws:ec2:*:*:instance/*", 47 | "Condition": { 48 | "StringEquals": { 49 | "aws:ResourceTag/FIS-Ready": "True" 50 | } 51 | } 52 | }, 53 | { 54 | "Effect": "Allow", 55 | "Action": [ 56 | "ec2:DescribeInstances" 57 | ], 58 | "Resource": "*" 59 | }, 60 | { 61 | "Effect": "Allow", 62 | "Action": [ 63 | "logs:CreateLogDelivery", 64 | "logs:PutResourcePolicy", 65 | "logs:DescribeResourcePolicies", 66 | "logs:DescribeLogGroups" 67 | ], 68 | "Resource": "*" 69 | } 70 | ] 71 | } 72 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/sap-ec2-instance-stop-sap-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "AllowEc2Actions", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "ec2:RebootInstances", 9 | "ec2:StartInstances", 10 | "ec2:StopInstances" 11 | ], 12 | "Resource": "arn:aws:ec2:*:*:instance/*" 13 | }, 14 | { 15 | "Sid": "AllowEc2InstancesWithEncryptedEbsVolumes", 16 | "Effect": "Allow", 17 | "Action": [ 18 | "kms:CreateGrant" 19 | ], 20 | "Resource": [ 21 | "arn:aws:kms:*:*:key/*" 22 | ], 23 | "Condition": { 24 | "StringLike": { 25 | "kms:ViaService": "ec2.*.amazonaws.com" 26 | }, 27 | "Bool": { 28 | "kms:GrantIsForAWSResource": "true" 29 | } 30 | } 31 | }, 32 | { 33 | "Sid": "AllowSSMSendOnEc2", 34 | "Effect": "Allow", 35 | "Action": [ 36 | "ssm:SendCommand" 37 | ], 38 | "Resource": [ 39 | "arn:aws:ec2:*:*:instance/*", 40 | "arn:aws:ssm:*:*:document/*" 41 | ] 42 | }, 43 | { 44 | "Sid": "AllowSSMStopOnEc2", 45 | "Effect": "Allow", 46 | "Action": [ 47 | "ssm:CancelCommand", 48 | "ssm:ListCommands" 49 | ], 50 | "Resource": "*" 51 | }, 52 | { 53 | "Sid": "DescribeInstances", 54 | "Effect": "Allow", 55 | "Action": "ec2:DescribeInstances", 56 | "Resource": "*" 57 | }, 58 | { 59 | "Effect": "Allow", 60 | "Action": [ 61 | "logs:CreateLogGroup", 62 | "logs:CreateLogStream", 63 | "logs:PutLogEvents", 64 | "logs:DescribeLogGroups", 65 | "logs:DescribeLogStreams" 66 | ], 67 | "Resource": "*" 68 | }, 69 | { 70 | "Effect": "Allow", 71 | "Action": [ 72 | "fis:StartExperiment", 73 | "fis:GetExperimentSummary", 74 | "fis:GetExperimentResults", 75 | "fis:StopExperiment" 76 | ], 77 | "Resource": "*" 78 | } 79 | ] 80 | } -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/aurora-postgres-cluster-loadtest-failover-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Aurora cluster CPU overload and failover experiment", 3 | "targets": { 4 | "AuroraCluster": { 5 | "resourceType": "aws:rds:cluster", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | }, 11 | "EC2Instance": { 12 | "resourceType": "aws:ec2:instance", 13 | "resourceTags": { 14 | "FIS-Ready": "True" 15 | }, 16 | "selectionMode": "ALL" 17 | } 18 | }, 19 | "actions": { 20 | "DelayAction": { 21 | "actionId": "aws:fis:wait", 22 | "description": "Wait 5 minutes to establish baseline metrics", 23 | "parameters": { 24 | "duration": "PT5M" 25 | } 26 | }, 27 | "RunLoadTest": { 28 | "actionId": "aws:ssm:send-command", 29 | "description": 
"Execute CPU load test on Aurora cluster", 30 | "parameters": { 31 | "documentArn": "arn:aws:ssm:::document/aurora-cluster-loadtest-document", 32 | "documentParameters": "{\"Duration\":\"600\",\"Concurrency\":\"10\"}", 33 | "duration": "PT10M" 34 | }, 35 | "targets": { 36 | "Instances": "EC2Instance" 37 | } 38 | }, 39 | "FailoverCluster": { 40 | "actionId": "aws:rds:failover-db-cluster", 41 | "description": "Initiate Aurora cluster failover", 42 | "parameters": {}, 43 | "targets": { 44 | "Clusters": "AuroraCluster" 45 | }, 46 | "startAfter": ["DelayAction"] 47 | } 48 | }, 49 | "stopConditions": [ 50 | { 51 | "source": "none" 52 | } 53 | ], 54 | "roleArn": "arn:aws:iam:::role/FISExperimentRole", 55 | "tags": { 56 | "Name": "Aurora-Cluster-CPU-Overload-Failover" 57 | }, 58 | "logConfiguration": { 59 | "logSchemaVersion": 2, 60 | "cloudWatchLogsConfiguration": { 61 | "logGroupArn": "arn:aws:logs:::log-group:FISExperimentLogs" 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /templates/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Spot Instances Interrupt 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | 10 | ## Description 11 | 12 | ## Hypothesis 13 | 14 | ## Prerequisites 15 | 16 | Before running this experiment, ensure that: 17 | 18 | 1. 19 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the termination operation. 20 | 3. The you want to target have the tag. 21 | 22 | ## How it works 23 | 24 | 25 | ## Stop Conditions 26 | 27 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 28 | 29 | ## Observability and stop conditions 30 | 31 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 32 | business metric requiring an immediate end of the fault injection. This 33 | template makes no assumptions about your application and the relevant metrics 34 | and does not include stop conditions by default. 35 | 36 | ## Next Steps 37 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 
-------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/sap-ec2-instance-stop-sap-database-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | "fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": "ec2:StopInstances", 17 | "Resource": "arn:aws:ec2:*:*:instance/*", 18 | "Condition": { 19 | "StringEquals": { 20 | "ec2:ResourceTag/FIS-Application": "SAP", 21 | "ec2:ResourceTag/FIS-Ready": "True", 22 | "ec2:ResourceTag/FIS-SAP-App-Tier": "Database", 23 | "ec2:ResourceTag/FIS-SAP-Environment-Type": "Dev", 24 | "ec2:ResourceTag/FIS-SAP-HA-Node": "Primary", 25 | "ec2:ResourceTag/FIS-SAP-SID": "S4" 26 | } 27 | } 28 | }, 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "logs:CreateLogGroup", 33 | "logs:CreateLogStream", 34 | "logs:PutLogEvents", 35 | "logs:DescribeLogGroups", 36 | "logs:DescribeLogStreams" 37 | ], 38 | "Resource": "*" 39 | }, 40 | { 41 | "Effect": "Allow", 42 | "Action": [ 43 | "logs:CreateLogGroup", 44 | "logs:CreateLogStream", 45 | "logs:PutLogEvents", 46 | "logs:DescribeLogGroups", 47 | "logs:DescribeLogStreams" 48 | ], 49 | "Resource": "*" 50 | }, 51 | { 52 | "Effect": "Allow", 53 | "Action": [ 54 | "fis:StartExperiment", 55 | "fis:GetExperimentSummary", 56 | "fis:GetExperimentResults", 57 | "fis:StopExperiment" 58 | ], 59 | "Resource": "*" 60 | } 61 | ] 62 | } 63 | -------------------------------------------------------------------------------- /ec2-instances-terminate/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Instance Termination 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Hypothesis 11 | 12 | Our application will remain available while 25% of the EC2 instances within our Auto Scaling group are terminated. 13 | 14 | ## Prerequisites 15 | 16 | Before running this experiment, ensure that: 17 | 18 | 1. You have the necessary permissions to execute the FIS experiment and perform the termination of EC2 instances. 19 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the termination operation. 20 | 3. The EC2 instances you want to target have the `FIS-Ready=True` tag. 21 | 22 | ## Stop Conditions 23 | 24 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been terminated.
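If you need to halt the experiment before it completes, it can be stopped from the AWS CLI. A minimal sketch, assuming the CLI is configured for the account and region running the experiment; the experiment ID shown is a placeholder:

```bash
# List experiments that are currently running.
aws fis list-experiments \
  --query 'experiments[?state.status==`running`].[id,experimentTemplateId,state.status]' \
  --output table

# Stop a specific running experiment by its ID (placeholder value).
aws fis stop-experiment --id EXPexample1234567890
```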
25 | 26 | ## Observability and stop conditions 27 | 28 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 29 | business metric requiring an immediate end of the fault injection. This 30 | template makes no assumptions about your application and the relevant metrics 31 | and does not include stop conditions by default. 32 | 33 | ## Next Steps 34 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 35 | 36 | ## Import Experiment 37 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 38 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/elasticache-redis-primary-node-failover-automation.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "0.3", 3 | "description": "Test ElastiCache Redis failover using tag-based discovery", 4 | "assumeRole": "arn:aws:iam::{{ global:ACCOUNT_ID }}:role/ElastiCache-SSM-Automation-Role", 5 | "parameters": { 6 | "tagKey": { 7 | "type": "String", 8 | "description": "Tag key to identify ElastiCache clusters to target", 9 | "default": "FIS-Ready" 10 | }, 11 | "tagValue": { 12 | "type": "String", 13 | "description": "Tag value to identify ElastiCache clusters to target", 14 | "default": "True" 15 | } 16 | }, 17 | "mainSteps": [ 18 | { 19 | "name": "triggerFailover", 20 | "action": "aws:executeScript", 21 | "inputs": { 22 | "Runtime": "python3.11", 23 | "Handler": "trigger_failover", 24 | "Script": "import boto3\ndef trigger_failover(events, context):\n region = events[\"region\"]\n tag_key = events[\"tagKey\"]\n tag_value = events[\"tagValue\"]\n elasticache = boto3.client(\"elasticache\", region_name=region)\n response = elasticache.describe_replication_groups()\n for rg in response[\"ReplicationGroups\"]:\n if rg[\"Status\"] == \"available\" and rg.get(\"AutomaticFailover\") == \"enabled\":\n rg_id = rg[\"ReplicationGroupId\"]\n try:\n account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n arn = \"arn:aws:elasticache:{}:{}:replicationgroup:{}\".format(region, account_id, rg_id)\n tags_response = elasticache.list_tags_for_resource(ResourceName=arn)\n tags = {tag[\"Key\"]: tag[\"Value\"] for tag in tags_response.get(\"TagList\", [])}\n if tags.get(tag_key) == tag_value:\n elasticache.test_failover(ReplicationGroupId=rg_id, NodeGroupId=\"0001\")\n return {\"ReplicationGroupId\": rg_id, \"Status\": \"Failover initiated\"}\n except Exception as e:\n continue\n raise Exception(\"No cluster found with tag {}={}\".format(tag_key, tag_value))", 25 | "InputPayload": { 26 | "region": "{{ global:REGION }}", 27 | "tagKey": "{{ tagKey }}", 28 | "tagValue": "{{ tagValue }}" 29 | } 30 | }, 31 | "outputs": [ 32 | { 33 | "Name": "Result", 34 | "Selector": "$.Payload", 35 | "Type": "StringMap" 36 | } 37 | ] 38 | } 39 | ] 40 | } 41 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/mysql-rds-loadtest-failover-iam-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | 
"Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "fis:StartExperiment", 8 | "fis:GetExperimentSummary", 9 | "fis:GetExperimentResults", 10 | "fis:StopExperiment" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Effect": "Allow", 16 | "Action": [ 17 | "rds:RebootDBInstance", 18 | "rds:DescribeDBInstances", 19 | "rds:DescribeDBClusters" 20 | ], 21 | "Resource": "*", 22 | "Condition": { 23 | "StringEquals": { 24 | "aws:ResourceTag/FIS-Ready": "True" 25 | } 26 | } 27 | }, 28 | { 29 | "Effect": "Allow", 30 | "Action": [ 31 | "ssm:SendCommand" 32 | ], 33 | "Resource": [ 34 | "arn:aws:ssm:*:*:document/*" 35 | ] 36 | }, 37 | { 38 | "Effect": "Allow", 39 | "Action": [ 40 | "ssm:SendCommand" 41 | ], 42 | "Resource": [ 43 | "arn:aws:ec2:*:*:instance/*" 44 | ], 45 | "Condition": { 46 | "StringEquals": { 47 | "aws:ResourceTag/FIS-Ready": "True" 48 | } 49 | } 50 | }, 51 | { 52 | "Effect": "Allow", 53 | "Action": [ 54 | "ssm:GetCommandInvocation", 55 | "ssm:ListCommands", 56 | "ssm:ListCommandInvocations", 57 | "ssm:DescribeInstanceInformation" 58 | ], 59 | "Resource": "*" 60 | }, 61 | { 62 | "Effect": "Allow", 63 | "Action": [ 64 | "ec2:DescribeInstances" 65 | ], 66 | "Resource": "*" 67 | }, 68 | { 69 | "Effect": "Allow", 70 | "Action": [ 71 | "logs:CreateLogGroup", 72 | "logs:CreateLogStream", 73 | "logs:PutLogEvents", 74 | "logs:DescribeLogGroups", 75 | "logs:DescribeLogStreams", 76 | "logs:CreateLogDelivery", 77 | "logs:DeleteLogDelivery", 78 | "logs:DescribeResourcePolicies", 79 | "logs:PutResourcePolicy" 80 | ], 81 | "Resource": "*" 82 | } 83 | ] 84 | } 85 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/mysql-rds-loadtest-failover-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "MySQL RDS Load Test and Failover Experiment", 3 | "targets": { 4 | "LoadTestInstances": { 5 | "resourceType": "aws:ec2:instance", 6 | "resourceTags": { 7 | "FIS-Ready": "True" 8 | }, 9 | "selectionMode": "ALL" 10 | }, 11 | "MySQLInstances": { 12 | "resourceType": "aws:rds:db", 13 | "resourceTags": { 14 | "FIS-Ready": "True" 15 | }, 16 | "selectionMode": "ALL" 17 | } 18 | }, 19 | "actions": { 20 | "RunLoadTest": { 21 | "actionId": "aws:ssm:send-command", 22 | "description": "Run MySQL high CPU load test until target CPU utilization is reached", 23 | "parameters": { 24 | "documentArn": "arn:aws:ssm:::document/MySQL-LoadTest-Document", 25 | "documentParameters": "{\"Duration\":\"600\",\"Concurrency\":\"25\",\"TargetCPU\":\"80\"}", 26 | "duration": "PT15M" 27 | }, 28 | "targets": { 29 | "Instances": "LoadTestInstances" 30 | } 31 | }, 32 | "ForceFailover": { 33 | "actionId": "aws:rds:reboot-db-instances", 34 | "description": "Force a failover by rebooting the primary instance with failover", 35 | "parameters": { 36 | "forceFailover": "true" 37 | }, 38 | "targets": { 39 | "DBInstances": "MySQLInstances" 40 | }, 41 | "startAfter": ["RunLoadTest"] 42 | }, 43 | "StopLoadTest": { 44 | "actionId": "aws:ssm:send-command", 45 | "description": "Stop the load test after failover completes", 46 | "parameters": { 47 | "documentArn": "arn:aws:ssm:::document/AWS-RunShellScript", 48 | "documentParameters": "{\"commands\":[\"pkill -f 'mysql_load_worker'\",\"echo \\\"Load test stopped\\\"\"]}", 49 | "duration": "PT1M" 50 | }, 51 | "targets": { 52 | "Instances": "LoadTestInstances" 53 | }, 54 | "startAfter": ["ForceFailover"], 55 | "startAfterDelay": "PT5M" 56 | } 57 | }, 58 | "stopConditions": [ 
59 | { 60 | "source": "none" 61 | } 62 | ], 63 | "roleArn": "arn:aws:iam:::role/", 64 | "logConfiguration": { 65 | "logSchemaVersion": 2, 66 | "cloudWatchLogsConfiguration": { 67 | "logGroupArn": "arn:aws:logs:::log-group:/aws/fis/experiment:*" 68 | } 69 | }, 70 | "tags": { 71 | "Name": "MySQL-RDS-LoadTest-Failover" 72 | }, 73 | "experimentOptions": { 74 | "accountTargeting": "single-account", 75 | "emptyTargetResolutionMode": "fail" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /aurora-global-region-failover/README.md: -------------------------------------------------------------------------------- 1 | # Aurora Global Database Regional Failover 2 | 3 | This experiment performs Aurora Global Database regional failover/switchover to test disaster recovery procedures and measure RTO/RPO. 4 | 5 | ## Prerequisites 6 | 7 | - Aurora Global Database with primary and secondary clusters 8 | - Global cluster tagged with `FIS-Ready: True` 9 | - IAM roles for FIS and SSM automation 10 | 11 | ## Failover Types 12 | 13 | - **Switchover** (default): Planned operation with no data loss for maintenance or testing 14 | - **Failover**: Emergency operation allowing data loss for disaster recovery 15 | 16 | ## Files 17 | 18 | - `aurora-global-region-failover-automation.yaml` - SSM automation document 19 | - `aurora-global-region-failover-experiment-template.json` - FIS experiment template 20 | - `aurora-global-region-failover-fis-role-iam-policy.json` - IAM policy for FIS role 21 | - `aurora-global-region-failover-ssm-automation-role-iam-policy.json` - IAM policy for SSM role 22 | - `fis-iam-trust-relationship.json` - Trust relationship for FIS role 23 | - `ssm-iam-trust-relationship.json` - Trust relationship for SSM role 24 | 25 | ## Setup 26 | 27 | 1. Create IAM roles: 28 | ```bash 29 | aws iam create-role --role-name --assume-role-policy-document file://fis-iam-trust-relationship.json 30 | aws iam put-role-policy --role-name --policy-name --policy-document file://aurora-global-region-failover-fis-role-iam-policy.json 31 | 32 | aws iam create-role --role-name --assume-role-policy-document file://ssm-iam-trust-relationship.json 33 | aws iam put-role-policy --role-name --policy-name --policy-document file://aurora-global-region-failover-ssm-automation-role-iam-policy.json 34 | ``` 35 | 36 | 2. Create SSM automation document: 37 | ```bash 38 | aws ssm create-document --name aurora-global-region-failover-automation --document-type Automation --content file://aurora-global-region-failover-automation.yaml --document-format YAML 39 | ``` 40 | 41 | 3. 
Update experiment template with your values and create: 42 | ```bash 43 | # Edit aurora-global-region-failover-experiment-template.json with your account/region/cluster details 44 | aws fis create-experiment-template --cli-input-json file://aurora-global-region-failover-experiment-template.json 45 | ``` 46 | 47 | ## Parameters 48 | 49 | - `globalClusterIdentifier`: Aurora Global Database cluster identifier (required) 50 | - `failoverType`: "switchover" for planned operations or "failover" for emergency with data loss (default: "switchover") 51 | - `AutomationAssumeRole`: IAM role ARN for automation execution (required) 52 | 53 | ## Usage 54 | 55 | Run the FIS experiment to perform a managed failover/switchover of the Aurora Global Database: 56 | 57 | ```bash 58 | aws fis start-experiment --experiment-template-id 59 | ``` 60 | 61 | The experiment will automatically detect the secondary cluster and promote it to primary based on the configured failover type. 62 | -------------------------------------------------------------------------------- /ec2-windows-stop-iis/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: Stopping IIS on Windows EC2 Instance 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Hypothesis 11 | 12 | Our application will remain available and resilient when IIS (Internet Information Services) is stopped on one of our Windows EC2 instances, simulating a scenario where the web server crashes or fails to start. 13 | 14 | ![Stop IIS Experiment](images/ssm.png) 15 | 16 | ## Prerequisites 17 | 18 | Before running this experiment, ensure that: 19 | 20 | 1. You have the necessary permissions to execute the FIS experiment and perform actions on Windows EC2 instances. 21 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the IIS stopping operation. 22 | 3. The Windows EC2 instances you want to target have the `FIS-Ready=True` tag. 23 | 4. SSM Agent is installed and running on the target Windows EC2 instances. 24 | 5. The IAM role associated with the EC2 instances has the necessary permissions for SSM. 25 | 26 | ## Stop Conditions 27 | 28 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until the IIS stopping action has been completed on the targeted resources. 29 | 30 | ## Observability and stop conditions 31 | 32 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 33 | business metric requiring an immediate end of the fault injection. This 34 | template makes no assumptions about your application and the relevant metrics 35 | and does not include stop conditions by default. 
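Because the fault is delivered through Systems Manager, you may want to confirm that the tagged Windows instances from the prerequisites above are registered with SSM before starting the experiment. A minimal sketch, assuming the `FIS-Ready=True` tag; adjust the tag key and value to your environment:

```bash
# Confirm SSM Agent is online on the tagged Windows targets.
aws ssm describe-instance-information \
  --filters "Key=tag:FIS-Ready,Values=True" \
  --query 'InstanceInformationList[].[InstanceId,PlatformType,PingStatus]' \
  --output table
```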
36 | 37 | ## Next Steps 38 | As you adapt this scenario to your needs, we recommend: 39 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 40 | 2. Identifying business metrics tied to the IIS service availability. 41 | 3. Creating an Amazon CloudWatch metric and Amazon CloudWatch alarm to monitor the impact of stopping IIS. 42 | 4. Adding a stop condition tied to the alarm to automatically halt the experiment if critical thresholds are breached. 43 | 5. Implementing proper logging and monitoring to track the behavior of your application when IIS is stopped. 44 | 45 | ## Import Experiment 46 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 47 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-ascs/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Stop - SAP ASCS ERS Cluster 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Description 11 | 12 | Explore the impact of interrupting the EC2 instance that hosts the SAP ABAP Central Services (ASCS). 13 | 14 | In this experiment we target EC2 Instances in the current region that have a specific tag attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on the EC2 instance hosting the ABAP SAP Central Services (ASCS), the ASCS process will fail over to the standby EC2 instance hosting the Enqueue Replication Server (ERS). The failover will occur within 5-15 minutes and users can resume operations. This validates the SAP application cluster configuration. 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the necessary permissions to execute the FIS experiment. 25 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the stop operation. 26 | 3. All your AWS resources are correctly tagged. 27 | ``` 28 | "FIS-Application": "SAP", 29 | "FIS-Ready": "True", 30 | "FIS-SAP-App-Tier": "Application", 31 | "FIS-SAP-Environment-Type": "Dev", 32 | "FIS-SAP-HA-Node": "Primary", 33 | "FIS-SAP-SID": "S4" 34 | ``` 35 | 36 | ## Stop Conditions 37 | 38 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 39 | 40 | ## Observability and stop conditions 41 | 42 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 43 | business metric requiring an immediate end of the fault injection.
This 44 | template makes no assumptions about your application and the relevant metrics 45 | and does not include stop conditions by default. 46 | 47 | ## Next Steps 48 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 49 | 50 | ## Import Experiment 51 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). -------------------------------------------------------------------------------- /aurora-cluster-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: Aurora Cluster Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and 4 | fis-template-library-tooling. This experiment template requires deployment into 5 | your AWS account and requires resources in your AWS account to inject faults into. 6 | 7 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 8 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 9 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 10 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | 12 | ## Description 13 | 14 | Explore the impact of failing over an Amazon Aurora cluster. 15 | 16 | In this experiment we target Amazon Aurora clusters in the current region that have a specific tag attached. 17 | 18 | ## Hypothesis 19 | 20 | Failover of an Aurora Cluster between the reader and writer instance may cause requests to fail for a brief period of time, but requests will automatically recover, and the application will continue to function as normal after the failover. 21 | 22 | ## Prerequisites 23 | 24 | Before running this experiment, ensure that: 25 | 26 | 1. You have the necessary permissions to execute the FIS experiment and perform the failover operation on the targeted Aurora clusters. 27 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the failover operation. 28 | 3. The Aurora clusters you want to target have the `FIS-Ready=True` tag. 29 | 4. The targeted Aurora clusters are configured for Multi-AZ deployment with writer and reader instances, and proper replication is set up. 30 | 31 | ## How it works 32 | 33 | This template simulates an Aurora DB cluster failover. It promotes one of the Aurora Replicas (read-only instances) in the DB cluster to be the primary DB instance (the cluster writer). To use this scenario you must have Amazon Aurora clusters that have the tag `FIS-Ready=True`. 34 | 35 | ## Observability and stop conditions 36 | 37 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 38 | business metric requiring an immediate end of the fault injection. This 39 | template makes no assumptions about your application and the relevant metrics 40 | and does not include stop conditions by default.
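To observe the failover, you can check which cluster member holds the writer role before and after running the experiment. A minimal sketch, assuming the AWS CLI and a placeholder cluster identifier:

```bash
# Show each cluster member and whether it is currently the writer
# (the cluster identifier below is a placeholder).
aws rds describe-db-clusters \
  --db-cluster-identifier my-aurora-cluster \
  --query 'DBClusters[0].DBClusterMembers[*].[DBInstanceIdentifier,IsClusterWriter]' \
  --output table
```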
41 | 42 | ## Next Steps 43 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 44 | 45 | ## Import Experiment 46 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). -------------------------------------------------------------------------------- /sap-ebs-pause-database-data/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EBS Pause - SAP Database Cluster 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Description 11 | 12 | Explore the impact of pausing I/O on the EBS volume that hosts the SAP database data. 13 | 14 | In this experiment we target EBS volumes in the current region that have specific tags attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on the block storage volume attached to the EC2 instance hosting the SAP database, the application will be unable to write data, causing a failover to the standby EC2 instance hosted in another AZ. The failover will occur within 15-30 minutes and users can resume operations. The application has an RTO requirement of 30 minutes and an RPO of near zero. This validates the SAP database cluster configuration. 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the necessary permissions to execute the FIS experiment. 25 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the volume I/O pause operation. 26 | 3. All your AWS resources are correctly tagged. 27 | ``` 28 | "FIS-Application": "SAP", 29 | "FIS-Ready": "True", 30 | "FIS-SAP-App-Tier": "Database", 31 | "FIS-SAP-Database-Type": "Data", 32 | "FIS-SAP-Environment-Type": "Dev", 33 | "FIS-SAP-SID": "S4" 34 | ``` 35 | 36 | ## Stop Conditions 37 | 38 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 39 | 40 | ## Observability and stop conditions 41 | 42 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 43 | business metric requiring an immediate end of the fault injection. This 44 | template makes no assumptions about your application and the relevant metrics 45 | and does not include stop conditions by default.
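Before starting the experiment, it can help to confirm which volumes would actually be paused. A minimal sketch, assuming the tags shown above and the `us-east-1a` zone used in the sample template; adjust both to your environment:

```bash
# List the SAP data volumes the experiment would target.
aws ec2 describe-volumes \
  --filters "Name=tag:FIS-Ready,Values=True" \
            "Name=tag:FIS-SAP-Database-Type,Values=Data" \
            "Name=availability-zone,Values=us-east-1a" \
  --query 'Volumes[].[VolumeId,State,Attachments[0].InstanceId]' \
  --output table
```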
46 | 47 | ## Next Steps 48 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 49 | 50 | ## Import Experiment 51 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 52 | -------------------------------------------------------------------------------- /sap-ec2-instance-stop-database/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Stop - SAP Database Cluster 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Description 11 | 12 | Explore the impact of stopping the EC2 instance that hosts the SAP database. 13 | 14 | In this experiment we target EC2 instances in the current region that have specific tags attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on the EC2 instance hosting the SAP database, the application will fail over to the standby EC2 instance hosted in another AZ. The failover will occur within 15-30 minutes and users can resume operations. The application has an RTO requirement of 30 minutes and an RPO of near zero. This validates the SAP database cluster configuration. 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the necessary permissions to execute the FIS experiment. 25 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the stop operation. 26 | 3. All your AWS resources are correctly tagged. 27 | ``` 28 | "ec2:ResourceTag/FIS-Application": "SAP", 29 | "ec2:ResourceTag/FIS-Ready": "True", 30 | "ec2:ResourceTag/FIS-SAP-App-Tier": "Database", 31 | "ec2:ResourceTag/FIS-SAP-Environment-Type": "Dev", 32 | "ec2:ResourceTag/FIS-SAP-HA-Node": "Primary", 33 | "ec2:ResourceTag/FIS-SAP-SID": "S4" 34 | ``` 35 | 36 | ## Stop Conditions 37 | 38 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 39 | 40 | ## Observability and stop conditions 41 | 42 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 43 | business metric requiring an immediate end of the fault injection. This 44 | template makes no assumptions about your application and the relevant metrics 45 | and does not include stop conditions by default.
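Before running the experiment, you can confirm which primary database node would be stopped. A minimal sketch using the tag filters listed above; adjust them to your landscape:

```bash
# Show the tagged primary SAP database instance(s) the experiment would stop.
aws ec2 describe-instances \
  --filters "Name=tag:FIS-Ready,Values=True" \
            "Name=tag:FIS-SAP-App-Tier,Values=Database" \
            "Name=tag:FIS-SAP-HA-Node,Values=Primary" \
  --query 'Reservations[].Instances[].[InstanceId,Placement.AvailabilityZone,State.Name]' \
  --output table
```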
46 | 47 | ## Next Steps 48 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 49 | 50 | ## Import Experiment 51 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiments 2 | 3 | This repository contains a collection of AWS Fault Injection Service (FIS) experiments designed to test the resilience and fault tolerance of your AWS resources and applications. These experiments simulate various failure scenarios to help you identify potential vulnerabilities and validate your system's ability to recover from disruptions. 4 | 5 | ## Available Experiments 6 | 7 | Browse the experiment directories to find templates for various fault injection scenarios: 8 | 9 | - **EC2 Instance Management**: `ec2-instances-terminate/`, `ec2-spot-interruption/`, `ec2-windows-stop-iis/` 10 | - **Database Resilience**: `aurora-cluster-failover/`, `sap-ebs-pause-database-data/` 11 | - **SAP Systems**: `sap-ec2-instance-stop-ascs/`, `sap-ec2-instance-stop-database/` 12 | - **Simple Queue Service (SQS)**: `sqs-queue-impairment/` 13 | 14 | Each experiment directory contains: 15 | - Complete FIS experiment template (JSON) 16 | - Required IAM policies and trust relationships 17 | - Comprehensive README with setup instructions 18 | - Additional automation files where applicable 19 | 20 | ## Getting Started 21 | 22 | To use these experiments, follow these steps: 23 | 24 | 1. **Prerequisites**: Ensure you have the necessary permissions and IAM roles configured to run FIS experiments in your AWS account. 25 | 26 | 2. **Choose an Experiment**: Browse the available experiment directories and select one that matches your testing scenario. 27 | 28 | 3. **Review Documentation**: Read the experiment's README.md file thoroughly to understand prerequisites, expected behavior, and safety considerations. 29 | 30 | 4. **Configuration**: Customize the template files by replacing placeholder values (e.g., ``, ``) with your specific AWS account information. 31 | 32 | 5. **Deploy**: Import the experiment template into your AWS account using the [FIS Template Library Tooling](https://github.com/aws-samples/fis-template-library-tooling). 33 | 34 | 6. **Execute Safely**: Run the experiment in a non-production environment first, with proper monitoring and stop conditions in place. 35 | 36 | 7. **Monitor and Analyze**: Observe the impact on your resources and analyze the results to improve your system's resilience. 37 | 38 | ## Contributing 39 | 40 | We welcome contributions of new FIS experiment templates! 
41 | 42 | **📋 Before contributing, please read our [Style Guide](STYLE_GUIDE.md) which details all requirements and standards.** 43 | 44 | Key requirements for contributions: 45 | - Follow the standardized directory structure and file naming conventions 46 | - Include comprehensive documentation with safety disclaimers 47 | - Provide complete IAM policies following least privilege principles 48 | - Include observability and monitoring recommendations 49 | - Reference the `ec2-windows-stop-iis/` directory as the gold standard example 50 | 51 | See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed contribution guidelines. 52 | 53 | ## Disclaimer 54 | 55 | These experiments are designed to simulate failure scenarios in your AWS environment. While precautions have been taken to minimize potential risks, running these experiments may cause temporary disruptions or outages to your resources and applications. It is highly recommended to thoroughly review and test the experiments in a non-production environment before running them in a production setting. 56 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-experiment-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Simulate worsening impairment of SQS queues with specific tag by applying deny-all policy for increasing durations", 3 | "targets": {}, 4 | "actions": { 5 | "impairSqs2m": { 6 | "actionId": "aws:ssm:start-automation-execution", 7 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 2 minutes", 8 | "parameters": { 9 | "maxDuration": "PT1H", 10 | "documentArn": "arn:aws:ssm:::document/", 11 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT2M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 12 | }, 13 | "targets": {} 14 | }, 15 | "wait3m1": { 16 | "actionId": "aws:fis:wait", 17 | "description": "Wait for 3 minutes", 18 | "parameters": { 19 | "duration": "PT3M" 20 | }, 21 | "targets": {}, 22 | "startAfter": ["impairSqs2m"] 23 | }, 24 | "impairSqs5m": { 25 | "actionId": "aws:ssm:start-automation-execution", 26 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 5 minutes", 27 | "parameters": { 28 | "maxDuration": "PT1H", 29 | "documentArn": "arn:aws:ssm:::document/", 30 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT5M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 31 | }, 32 | "targets": {}, 33 | "startAfter": ["wait3m1"] 34 | }, 35 | "wait3m2": { 36 | "actionId": "aws:fis:wait", 37 | "description": "Wait for 3 minutes", 38 | "parameters": { 39 | "duration": "PT3M" 40 | }, 41 | "targets": {}, 42 | "startAfter": ["impairSqs5m"] 43 | }, 44 | "impairSqs7m": { 45 | "actionId": "aws:ssm:start-automation-execution", 46 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 7 minutes", 47 | "parameters": { 48 | "maxDuration": "PT1H", 49 | "documentArn": "arn:aws:ssm:::document/", 50 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT7M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 51 | }, 52 | "targets": {}, 53 | "startAfter": ["wait3m2"] 54 | }, 55 | "wait2m1": { 56 | "actionId": "aws:fis:wait", 57 | "description": "Wait for 2 minutes", 58 | "parameters": { 59 | "duration": "PT2M" 60 | }, 61 | "targets": {}, 62 | 
"startAfter": ["impairSqs7m"] 63 | }, 64 | "impairSqs15m": { 65 | "actionId": "aws:ssm:start-automation-execution", 66 | "description": "Simulate worsening impairment of SQS queues by applying deny-all policy for 15 minutes", 67 | "parameters": { 68 | "maxDuration": "PT1H", 69 | "documentArn": "arn:aws:ssm:::document/", 70 | "documentParameters": "{\"tagKey\": \"FIS-Ready\", \"tagValue\": \"True\", \"duration\": \"PT15M\", \"AutomationAssumeRole\": \"arn:aws:iam:::role/\"}" 71 | }, 72 | "targets": {}, 73 | "startAfter": ["wait2m1"] 74 | } 75 | }, 76 | "stopConditions": [ 77 | { 78 | "source": "none" 79 | } 80 | ], 81 | "roleArn": "arn:aws:iam:::role/", 82 | "tags": { 83 | "Name": "SimulateSqsImpairment", 84 | "Purpose": "resilience-testing" 85 | }, 86 | "experimentOptions": { 87 | "accountTargeting": "single-account", 88 | "emptyTargetResolutionMode": "skip" 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /ec2-spot-interruption/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: EC2 Spot Instances Interrupt 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | 10 | ## Description 11 | 12 | Explore the impact of the termination of EC2 Spot Instances. 13 | 14 | In this experiment we target EC2 Spot Instances in the current region that have a specific tag attached. 15 | 16 | ## Hypothesis 17 | 18 | When an interruption occurs on EC2 Spot Instances, instances will gracefully terminate, and applications or services running on those instances will be automatically restarted on new Spot Instances or fallback to On-Demand Instances, ensuring minimal disruption to the overall system. 19 | 20 | Specifically, we expect the following behavior: 21 | 22 | 1. **Graceful Termination**: Upon receiving the interruption signal, EC2 Spot Instances will initiate a graceful termination process, allowing applications or services to perform any necessary cleanup tasks or save their state before terminating. 23 | 24 | 2. **Automatic Restarting**: Applications or services running on the interrupted Spot Instances are configured for automatic restart and will be automatically launched on new Spot Instances or fallback to On-Demand Instances, depending on the defined scaling policies and capacity provisioning strategies. 25 | 26 | 3. **Load Balancing and Failover**: If the applications or services are running behind a load balancer, traffic will be automatically rerouted to the newly launched instances, ensuring seamless failover and minimizing downtime. 27 | 28 | 4. 
**Data Persistence**: Any persistent data or state associated with the applications or services running on the interrupted Spot Instances will be successfully recovered or replicated to the new instances, ensuring data consistency and integrity. 29 | 30 | 5. **Monitoring and Alerting**: The interruption event and subsequent recovery actions will be captured by the monitoring and alerting systems, providing visibility into the system's behavior and enabling timely incident response and analysis. 31 | 32 | By validating this hypothesis, we can demonstrate the resilience of our applications and services running on EC2 Spot Instances and ensure that they can gracefully handle interruptions while minimizing the impact on end-users or customers. 33 | 34 | ## Prerequisites 35 | 36 | Before running this experiment, ensure that: 37 | 38 | 1. You have the necessary permissions to execute the FIS experiment and perform the termination of EC2 Spot Instance 39 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the termination operation. 40 | 3. The EC2 Spot Instance you want to target have the `FIS-Ready=True` tag. 41 | 42 | ## How it works 43 | 44 | The experiment sends an interruption signal to 25% of targeted EC2 Spot Instances using the AWS API `aws:ec2:send-spot-instance-interruptions`. This action simulates a real-world scenario where the Spot Instances are interrupted due to changes in the Spot market or capacity constraints. 45 | 46 | `durationBeforeInterruption`: A duration of 4 minutes (PT4M) is set before the interruption is triggered. This allows for any necessary preparations or cleanup tasks to be executed before the interruption occurs. 47 | 48 | ## Stop Conditions 49 | 50 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all targeted resources have been interrupted. 51 | 52 | ## Observability and stop conditions 53 | 54 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 55 | business metric requiring an immediate end of the fault injection. This 56 | template makes no assumptions about your application and the relevant metrics 57 | and does not include stop conditions by default. 58 | 59 | ## Next Steps 60 | As you adapt this scenario to your needs, we recommend reviewing the tag names you use to ensure they fit your specific use case, identifying business metrics tied to the instances you are stopping, creating an Amazon CloudWatch metric and Amazon CloudWatch alarm, and adding a stop condition tied to the alarm. 61 | 62 | ## Import Experiment 63 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 64 | -------------------------------------------------------------------------------- /dynamodb-traffic-blackhole-region-impairment/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: DynamoDB Traffic Blackhole Region Impairment 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When network connectivity to DynamoDB is completely blocked from my application subnets, monitoring systems should detect the connectivity failure within 2-3 minutes and trigger alerts. The DevOps team should be notified within 5 minutes through our alerting channels. If automated failover is configured, it should activate within 10 minutes. For manual intervention, the team should acknowledge the incident within 15 minutes and complete failover procedures within 30-45 minutes. All DynamoDB operations should fail with network timeout errors during the 10-minute impairment period. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Network-level DynamoDB connectivity monitoring and alerting works correctly 14 | * Application timeout and retry logic handles network failures appropriately 15 | * Circuit breaker patterns function as expected for DynamoDB connectivity issues 16 | * Graceful degradation or failover mechanisms activate when DynamoDB is unreachable 17 | * Error handling and user experience during complete DynamoDB network blackouts 18 | * Recovery behavior when network connectivity is restored 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have created the IAM role for FIS with the provided policy document 25 | 2. You have created the FIS Experiment Template from the sample provided 26 | 3. **Update the AWS account ID** in the template files to match your account 27 | 4. The EC2 subnets containing your application instances have the "FIS-Ready":"True" tag 28 | 5. Your application instances are running in the tagged subnets and actively using DynamoDB 29 | 6. You have appropriate monitoring and observability in place to track the impact 30 | 31 | ## How it works 32 | 33 | This experiment uses the `aws:network:disrupt-connectivity` action with `scope: dynamodb` to block all network traffic between your application subnets and the DynamoDB regional endpoints. 34 | 35 | ### Network ACL Mechanism 36 | 37 | FIS temporarily: 38 | 1. Clones the existing network ACL associated with target subnets 39 | 2. Adds deny rules to block DynamoDB traffic in the cloned ACL 40 | 3. Associates the modified ACL with your subnets for the experiment duration 41 | 4. Automatically restores the original ACL when the experiment completes 42 | 43 | ### Duration and Scope 44 | 45 | - **Duration**: 10 minutes (configurable via `duration` parameter) 46 | - **Scope**: DynamoDB regional endpoints only - other AWS services remain accessible 47 | - **Traffic Blocked**: All inbound and outbound DynamoDB API calls from target subnets 48 | - **Intra-subnet**: Traffic between instances in the same subnet remains unaffected 49 | 50 | ## Target Resources 51 | 52 | This experiment targets EC2 subnets tagged with `FIS-Ready: True`. All instances in these subnets will lose DynamoDB connectivity during the experiment. 53 | 54 | ## Stop Conditions 55 | 56 | The experiment includes basic stop conditions. 
Consider adding CloudWatch alarms for: 57 | - Application error rates exceeding thresholds 58 | - Critical business metrics falling below acceptable levels 59 | - Infrastructure health checks failing 60 | 61 | ## Observability Recommendations 62 | 63 | Monitor these metrics during the experiment: 64 | - DynamoDB API call success/failure rates 65 | - Application error logs and exception counts 66 | - Network connectivity metrics from application instances 67 | - Circuit breaker state changes 68 | - User experience and transaction success rates 69 | 70 | ## Safety Considerations 71 | 72 | - Test in non-production environments first 73 | - Ensure your application can handle DynamoDB connectivity failures gracefully 74 | - Have rollback procedures ready if manual intervention is needed 75 | - Consider the impact on dependent services and downstream systems 76 | - Verify that critical business processes have appropriate fallback mechanisms 77 | 78 | ## Files Included 79 | 80 | - `dynamodb-traffic-blackhole-region-impairment-template.json` - FIS experiment template 81 | - `dynamodb-traffic-blackhole-region-impairment-iam-policy.json` - Required IAM permissions 82 | - `fis-iam-trust-relationship.json` - IAM trust relationship for FIS role 83 | - `AWSFIS.json` - Template version marker 84 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: ElastiCache Redis Primary Node Reboot 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When the Redis primary node is rebooted, applications should detect the brief connection disruption and reconnect automatically within 30 seconds. Connection pooling should handle the temporary unavailability gracefully, and no data should be lost during the reboot. Application performance should return to normal within 60 seconds of the node becoming available again. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate Redis connection monitoring and observability is in place (were you able to detect the reboot?) 14 | * Alarms are configured correctly for node availability changes (were the right people notified?) 15 | * Your application handles brief Redis connection disruptions gracefully 16 | * Connection pooling and retry logic work correctly during node reboots 17 | * Recovery controls and reconnection mechanisms work as expected 18 | 19 | ## Prerequisites 20 | 21 | Before running this experiment, ensure that: 22 | 23 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 24 | 2. 
You have created the SSM Automation Document from the sample provided (elasticache-redis-primary-node-reboot-automation.yaml) 25 | 3. You have created the FIS Experiment Template from the sample provided (elasticache-redis-primary-node-reboot-experiment-template.json) 26 | 4. The ElastiCache Redis cluster(s) you want to target have the "FIS-Ready":"True" tag and value 27 | 5. Your Redis cluster has Multi-AZ enabled with `AutomaticFailover=enabled` 28 | 6. You have appropriate monitoring and observability in place to track the impact of the experiment. 29 | 30 | ## How it works 31 | 32 | This experiment reboots the Redis primary node to test application resilience during brief connection disruptions. The experiment follows this sequence: 33 | 34 | 1. **Dynamic Discovery**: Scans all ElastiCache replication groups to find clusters tagged with "FIS-Ready":"True" 35 | 2. **Primary Identification**: Dynamically finds the current primary node using NodeGroups and CurrentRole 36 | 3. **Node Reboot**: Executes `reboot_cache_cluster` on the primary node 37 | 4. **Recovery Monitoring**: Tracks node status from "Rebooting cache cluster nodes" to "Available" 38 | 39 | The reboot is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document identifies the primary node and reboots it, then monitors the recovery process until the node returns to available status. 40 | 41 | To verify the experiment is working properly, you can monitor the node status and test connectivity (replace the `<...>` placeholders with your own values): 42 | 43 | ```bash 44 | # Monitor Redis connectivity during reboot 45 | watch -n 5 'redis-cli -h <primary-endpoint> ping' 46 | 47 | # Check node status 48 | aws elasticache describe-replication-groups --replication-group-id <replication-group-id> --query 'ReplicationGroups[0].NodeGroups[0].NodeGroupMembers[?CurrentRole==`primary`].CacheNodeStatus' 49 | 50 | # Monitor application health 51 | curl -I https://<application-endpoint>/health 52 | ``` 53 | 54 | During the experiment, you should see the node status change from "Available" to "Rebooting cache cluster nodes" and back to "Available" within 1-3 minutes. 55 | 56 | ## Stop Conditions 57 | 58 | The experiment does not have any specific stop conditions defined. The reboot completes automatically when the node returns to "Available" status. 59 | 60 | ## Observability and stop conditions 61 | 62 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 63 | 64 | ## Next Steps 65 | 66 | As you adapt this scenario to your needs, we recommend: 67 | 68 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 69 | 2. Identifying business metrics tied to your Redis operations, such as connection counts and application response times. 70 | 3. Creating Amazon CloudWatch metrics and alarms to monitor Redis node availability and connection health. 71 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if needed. 72 | 5. Implementing appropriate connection retry logic in your application to handle brief node unavailability. 73 | 6. Testing your application's Redis connection pooling and recovery mechanisms. 74 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 75 | 76 | ## Import Experiment 77 | 78 | You can import the JSON experiment template into your AWS account via the AWS CLI or AWS CDK.
For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 79 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing FIS Experiment Templates 24 | 25 | This repository contains AWS Fault Injection Service (FIS) experiment templates. When contributing new templates or modifying existing ones, please follow our comprehensive style guide. 26 | 27 | **📋 [Read the complete FIS Template Style Guide](STYLE_GUIDE.md) before contributing** 28 | 29 | ### Template Contribution Checklist 30 | 31 | Before submitting a FIS template, ensure you have: 32 | 33 | - [ ] **Reviewed the style guide**: Read [STYLE_GUIDE.md](STYLE_GUIDE.md) thoroughly 34 | - [ ] **Used the correct structure**: Follow the required directory and file naming conventions 35 | - [ ] **Included all required files**: README.md, AWSFIS.json, template JSON, IAM policy, and trust relationship 36 | - [ ] **Validated JSON files**: All JSON must be valid and properly formatted 37 | - [ ] **Included safety disclaimers**: Use exact disclaimer text as specified 38 | - [ ] **Written comprehensive documentation**: Include hypothesis, prerequisites, and next steps 39 | - [ ] **Followed security best practices**: IAM policies use least privilege with resource tag conditions 40 | - [ ] **Used proper parameterization**: Replace account-specific values with `` placeholders 41 | - [ ] **Added CloudWatch recommendations**: Include observability guidance in next steps 42 | - [ ] **Referenced the gold standard**: Compare your template against `ec2-windows-stop-iis/` example 43 | - [ ] **SSM document compliance**: If using SSM documents, follow comprehensive SSM best practices in the style guide 44 | 45 | ### Template Testing Requirements 46 | 47 | Before submission, verify your template: 48 | 49 | 1. **JSON validation**: Use a JSON validator on all `.json` files 50 | 2. **Markdown validation**: Check formatting with a markdown linter 51 | 3. **Deployment testing**: Test the template in a sandbox AWS environment 52 | 4. **Documentation accuracy**: Verify all instructions are clear and complete 53 | 5. **Security review**: Confirm IAM policies follow least privilege principles 54 | 55 | ## Contributing via Pull Requests 56 | Contributions via pull requests are much appreciated. 
Before sending us a pull request, please ensure that: 57 | 58 | 1. You are working against the latest source on the *main* branch. 59 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 60 | 3. **For FIS templates**: You have completed the template checklist above and reviewed the [style guide](STYLE_GUIDE.md). 61 | 4. You open an issue to discuss any significant work - we would hate for your time to be wasted. 62 | 63 | To send us a pull request, please: 64 | 65 | 1. Fork the repository. 66 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 67 | 3. **For FIS templates**: Ensure your template follows the [style guide](STYLE_GUIDE.md) requirements. 68 | 4. Ensure local tests pass. 69 | 5. Commit to your fork using clear commit messages. 70 | 6. Send us a pull request, answering any default questions in the pull request interface. 71 | 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 72 | 73 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 74 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 75 | 76 | 77 | ## Finding contributions to work on 78 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 79 | 80 | 81 | ## Code of Conduct 82 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 83 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 84 | opensource-codeofconduct@amazon.com with any additional questions or comments. 85 | 86 | 87 | ## Security issue notifications 88 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 89 | 90 | 91 | ## Licensing 92 | 93 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 94 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: ElastiCache Redis Connection Failure 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When Redis connections are disrupted, applications should gracefully handle the failure through circuit breaker mechanisms within 30 seconds. Client retry storms should be prevented, and applications should continue operating in degraded mode without cascading failures. Once Redis connectivity is restored, normal operations should resume within 60 seconds. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate Redis connectivity monitoring and observability is in place (were you able to detect the connection failure?) 14 | * Alarms are configured correctly for connectivity issues (were the right people notified?) 15 | * Your applications handle Redis unavailability gracefully without cascading failures 16 | * Redis client circuit breaker functionality works correctly 17 | * Client-side retry logic doesn't create amplification effects or retry storms 18 | * Recovery controls work as expected when Redis connectivity is restored 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 25 | 2. You have created the SSM Automation Document from the sample provided (redis-connection-failure-automation.yaml) 26 | 3. You have created the FIS Experiment Template from the sample provided (redis-connection-failure-experiment-template.json) 27 | 4. The ElastiCache Redis cluster(s) you want to target have the "FIS-Ready":"True" tag and value 28 | 5. Your applications implement proper Redis client circuit breakers and retry logic 29 | 6. You have appropriate monitoring and observability in place to track the impact of the experiment. 30 | 31 | ## How it works 32 | 33 | This experiment simulates Redis connection failures by modifying ElastiCache security groups to block connections for a specified duration. The experiment follows this sequence: 34 | 35 | 1. **Dynamic Discovery**: Scans all ElastiCache replication groups to find clusters tagged with "FIS-Ready":"True" 36 | 2. **Connection Disruption**: Removes security group rules to block Redis access from applications 37 | 3. **Sustained Failure**: Maintains connection disruption for specified duration to test resilience 38 | 4. **Restoration**: Restores security group rules to resume normal connectivity 39 | 40 | The connection failure is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document modifies security group rules to block access to Redis, then restores connectivity after the specified duration. 41 | 42 | To verify the experiment is working properly, you can monitor Redis connectivity and application behavior (replace the `<...>` placeholders with your own values): 43 | 44 | ```bash 45 | # Monitor Redis connectivity 46 | watch -n 5 'redis-cli -h <primary-endpoint> ping' 47 | 48 | # Check application health endpoints 49 | curl -I https://<application-endpoint>/health 50 | 51 | # Monitor security group rules 52 | aws ec2 describe-security-groups --group-ids <security-group-id> --query 'SecurityGroups[0].IpPermissions' 53 | ``` 54 | 55 | During the experiment, you should observe connection timeouts when attempting to reach Redis and applications activating circuit breakers or degraded mode operations.
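The prerequisites above assume your application already wraps Redis calls in a circuit breaker. As a point of reference only (it is not part of this template), the following is a minimal Python sketch of such a breaker using the open-source `redis-py` client; the endpoint, key name, and thresholds are placeholder values you would replace with your own:

```python
import time
import redis  # assumes the redis-py client library is available


class SimpleCircuitBreaker:
    """Stop calling Redis after repeated failures; retry after a cool-down period."""

    def __init__(self, failure_threshold=3, reset_timeout_seconds=30):
        self.failure_threshold = failure_threshold
        self.reset_timeout_seconds = reset_timeout_seconds
        self.consecutive_failures = 0
        self.opened_at = None

    def call(self, func, *args, **kwargs):
        if self.opened_at is not None:
            if time.monotonic() - self.opened_at < self.reset_timeout_seconds:
                raise RuntimeError("Circuit open: skipping Redis call")
            self.opened_at = None  # half-open: allow a single trial call

        try:
            result = func(*args, **kwargs)
            self.consecutive_failures = 0  # a success closes the circuit
            return result
        except (redis.exceptions.ConnectionError, redis.exceptions.TimeoutError):
            self.consecutive_failures += 1
            if self.consecutive_failures >= self.failure_threshold:
                self.opened_at = time.monotonic()
            raise


# Placeholder endpoint: substitute your own ElastiCache primary endpoint.
client = redis.Redis(host="your-redis-endpoint", port=6379, socket_timeout=2)
breaker = SimpleCircuitBreaker()

try:
    value = breaker.call(client.get, "some-key")
except Exception as exc:
    # Degraded mode: fall back to the primary datastore or a cached default.
    print(f"Redis unavailable, serving degraded response: {exc}")
```

Production applications would normally rely on the retry, backoff, and timeout settings of their Redis client library (or a dedicated resilience library) rather than hand-rolling this logic; the sketch is only meant to make the hypothesis concrete.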
56 | 57 | ## Stop Conditions 58 | 59 | The experiment does not have any specific stop conditions defined. It will continue to run until all actions are completed or until manually stopped. 60 | 61 | ## Observability and stop conditions 62 | 63 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 64 | 65 | ## Next Steps 66 | 67 | As you adapt this scenario to your needs, we recommend: 68 | 69 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 70 | 2. Identifying business metrics tied to your Redis connectivity, such as cache hit rates and application error rates. 71 | 3. Creating Amazon CloudWatch metrics and alarms to monitor Redis connectivity and circuit breaker activation. 72 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if needed. 73 | 5. Implementing appropriate circuit breakers in your application to handle Redis unavailability gracefully. 74 | 6. Testing your application's behavior under various connection failure scenarios and durations. 75 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 76 | 77 | ## Import Experiment 78 | 79 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 80 | -------------------------------------------------------------------------------- /sqs-queue-impairment/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: SQS Queue Impairment 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When the SQS service is experiencing an impairment in a region which impacts my application, an alarm should be raised and the DevOps team notified within 5 minutes. Functionality relating to component A should not be available to end users during the impairment; however, other components should continue to operate normally. Once the SQS impairment has been resolved, component A should become available to end users within 5 minutes. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate customer experience metrics and observability of SQS is in place (were you able to detect there was a problem?) 14 | * Alarms are configured correctly (were the right people notified and/or automations triggered?) 
15 | * Your app gracefully degrades and customers aren't submitting transactions which you know will fail 16 | * Your circuit breaker (if any) works as expected 17 | * Recovery controls (if any) work as expected 18 | 19 | ## Prerequisites 20 | 21 | Before running this experiment, ensure that: 22 | 23 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 24 | 2. You have created the SSM Automation Document from the sample provided (sqs-queue-impairment-tag-based-automation.yaml) 25 | 3. You have created the FIS Experiment Template from the sample provided (sqs-queue-impairment-tag-based-experiment-template.json) 26 | 4. The SQS queue(s) you want to target have the "FIS-Ready":"True" tag and value 27 | 5. You have appropriate monitoring and observability in place to track the impact of the experiment. 28 | 29 | ## How it works 30 | 31 | This experiment simulates a worsening impairment of an SQS queue by applying a deny-all policy that blocks access to the queue for increasing durations. The experiment follows this sequence: 32 | 33 | 1. First impairment: Blocks access to the SQS queue for 2 minutes 34 | 2. Wait period: 3 minutes of normal operation 35 | 3. Second impairment: Blocks access to the SQS queue for 5 minutes 36 | 4. Wait period: 3 minutes of normal operation 37 | 5. Third impairment: Blocks access to the SQS queue for 7 minutes 38 | 6. Wait period: 2 minutes of normal operation 39 | 7. Fourth impairment: Blocks access to the SQS queue for 15 minutes 40 | 41 | The impairment is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document adds a deny statement to the SQS queue policy that prevents all principals from performing key operations like sending and receiving messages. After the specified duration, the Automation Document removes the deny statement, restoring normal access to the queue. 42 | 43 | To verify the experiment is set up and working properly, you can use the AWS CLI to attempt operations on a targeted SQS queue (replace the `<...>` placeholders with your own values): 44 | 45 | ```bash 46 | watch -n 5 'aws sqs send-message --queue-url "https://sqs.<region>.amazonaws.com/<account-id>/<queue-name>" --message-body "This is a test message" --region <region> --no-cli-pager' 47 | ``` 48 | 49 | During the impairment periods, you should see "AccessDenied" errors when attempting to send or receive messages from the queue. 50 | 51 | ![FIS Console showing actions](./images/sqs.png "FIS Console showing actions") 52 | 53 | ## Stop Conditions 54 | 55 | The experiment does not have any specific stop conditions defined. It will continue to run until all actions are completed or until manually stopped. 56 | 57 | ## Observability and stop conditions 58 | 59 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 60 | 61 | ## Next Steps 62 | 63 | As you adapt this scenario to your needs, we recommend: 64 | 65 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 66 | 2. Identifying business metrics tied to your SQS queue processing, such as application transaction rates. 67 | 3. Creating an Amazon CloudWatch metric and Amazon CloudWatch alarm to monitor the impact of the SQS impairment. 68 | 4. Adding a stop condition tied to the alarm to automatically halt the experiment if critical thresholds are breached. 69 | 5.
Implementing appropriate circuit breakers in your application to handle SQS service impairments gracefully. 70 | 6. Testing your application's recovery mechanisms to ensure they work as expected after the SQS service is restored. 71 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 72 | 73 | ## Import Experiment 74 | 75 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 76 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: ElastiCache Redis Primary Node Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When the Redis primary node fails over to a replica, applications should detect the failover and reconnect to the new primary within 30 seconds. Connection pooling should handle the DNS endpoint changes gracefully, and no data should be lost during the transition. Application performance should return to normal within 60 seconds of failover completion. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate Redis connection monitoring and observability is in place (were you able to detect the failover?) 14 | * Alarms are configured correctly for primary node changes (were the right people notified?) 15 | * Your application handles Redis primary node changes gracefully 16 | * Connection pooling and DNS resolution work correctly during failover 17 | * Recovery controls and reconnection logic work as expected 18 | 19 | ## Prerequisites 20 | 21 | Before running this experiment, ensure that: 22 | 23 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 24 | 2. You have created the SSM Automation Document from the sample provided (elasticache-redis-primary-node-failover-automation.json) 25 | 3. You have created the FIS Experiment Template from the sample provided (elasticache-redis-primary-node-failover-experiment-template.json) 26 | 4. The ElastiCache Redis cluster(s) you want to target have the "FIS-Ready":"True" tag and value 27 | 5. Your Redis cluster has Multi-AZ enabled with `AutomaticFailover=enabled` 28 | 6. Your cluster has at least 1 primary + 1 replica node 29 | 7. You have appropriate monitoring and observability in place to track the impact of the experiment. 30 | 31 | ## How it works 32 | 33 | This experiment forces a Redis primary node failover by using the ElastiCache TestFailover API to promote a replica node to primary. 
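As a rough illustration of the underlying API call (not part of the template itself), the snippet below shows how a failover could be triggered with boto3; the replication group and node group IDs are placeholders, and the actual SSM automation discovers these values dynamically from the "FIS-Ready" tag rather than hard-coding them:

```python
import boto3

# Placeholder identifiers: substitute the values for your own cluster.
REPLICATION_GROUP_ID = "my-redis-cluster"
NODE_GROUP_ID = "0001"

elasticache = boto3.client("elasticache")

# Confirm Multi-AZ automatic failover is enabled before triggering the test failover.
group = elasticache.describe_replication_groups(
    ReplicationGroupId=REPLICATION_GROUP_ID
)["ReplicationGroups"][0]
if group["AutomaticFailover"] != "enabled":
    raise RuntimeError("AutomaticFailover must be enabled for this experiment")

# Promote a replica in the node group to primary, as the experiment does.
elasticache.test_failover(
    ReplicationGroupId=REPLICATION_GROUP_ID,
    NodeGroupId=NODE_GROUP_ID,
)
```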
The experiment follows this sequence: 34 | 35 | 1. **Dynamic Discovery**: Scans all ElastiCache replication groups to find clusters tagged with "FIS-Ready":"True" 36 | 2. **Validation**: Ensures the cluster has `AutomaticFailover=enabled` and is in `available` status 37 | 3. **Failover Trigger**: Uses `test_failover` API to promote a replica to primary role 38 | 4. **DNS Update**: ElastiCache automatically updates the master endpoint to point to the new primary 39 | 5. **Role Swap**: The former primary becomes a replica, and the replica becomes the new primary 40 | 41 | The failover is implemented using an SSM Automation Document invoked by FIS. The SSM Automation Document uses the ElastiCache TestFailover API to trigger the failover process, which automatically promotes a replica to primary and updates the DNS endpoint. 42 | 43 | To verify the experiment is working properly, you can monitor the primary node before and after (replace the `<...>` placeholders with your own values): 44 | 45 | ```bash 46 | # Check current primary before experiment 47 | aws elasticache describe-replication-groups --replication-group-id <replication-group-id> --query 'ReplicationGroups[0].NodeGroups[0].NodeGroupMembers[?CurrentRole==`primary`].CacheClusterId' 48 | 49 | # Monitor during experiment 50 | watch -n 5 'aws elasticache describe-replication-groups --replication-group-id <replication-group-id> --query "ReplicationGroups[0].Status"' 51 | ``` 52 | 53 | During the failover, you should see the cluster status change from "available" to "modifying" and back to "available" as the primary node changes. 54 | 55 | ## Stop Conditions 56 | 57 | The experiment does not have any specific stop conditions defined. The failover completes automatically when the TestFailover operation finishes and the cluster returns to "available" status. 58 | 59 | ## Observability and stop conditions 60 | 61 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 62 | 63 | ## Next Steps 64 | 65 | As you adapt this scenario to your needs, we recommend: 66 | 67 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 68 | 2. Identifying business metrics tied to your Redis operations, such as cache hit rates and application response times. 69 | 3. Creating Amazon CloudWatch metrics and alarms to monitor Redis failover impact. 70 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if needed. 71 | 5. Implementing appropriate connection retry logic in your application to handle primary node changes. 72 | 6. Testing your application's Redis connection pooling and DNS resolution during failover scenarios. 73 | 7. Documenting the findings from your experiment and updating your incident response procedures accordingly. 74 | 75 | ## Import Experiment 76 | 77 | You can import the JSON experiment template into your AWS account via the AWS CLI or AWS CDK. For step-by-step instructions, [click here](https://github.com/aws-samples/fis-template-library-tooling). 78 | -------------------------------------------------------------------------------- /mysql-rds-loadtest-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: MySQL RDS Load Test and Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling.
This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 6 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 7 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 8 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE 9 | 10 | ## Hypothesis 11 | 12 | When high CPU load is generated on a Multi-AZ MySQL RDS instance followed by a failover event, the system will transition from the primary to the standby instance with approximately 25 seconds of downtime, and applications implementing proper connection handling will automatically reconnect with a success rate of nearly 100%, maintaining normal functionality once the failover process completes. 13 | 14 | ## Prerequisites 15 | 16 | Before running this experiment, ensure that: 17 | 18 | 1. You have the necessary permissions to execute the FIS experiment and perform the failover operation on RDS instances. 19 | 2. The IAM role specified in the `roleArn` field has the required permissions to perform the failover operation and execute SSM documents. 20 | 3. The MySQL RDS instances you want to target have the `FIS-Ready=True` tag. 21 | 4. **You have an EC2 instance tagged with `FIS-Ready=True` that serves as the load generator** 22 | - This instance must have network connectivity to your MySQL RDS instance 23 | - The instance must have the SSM Agent installed and running 24 | - The instance requires appropriate IAM permissions to execute SSM documents 25 | - The instance will execute CPU-intensive database queries against the MySQL RDS instance 26 | 5. The targeted MySQL RDS instances are configured for Multi-AZ deployment. 27 | 6. The IAM role associated with the EC2 instances has the necessary permissions for SSM. 28 | 7. You have deployed the SSM document template (`mysql-rds-loadtest-failover-ssm-template.json`) to your account. 29 | 30 | ## Architecture Overview 31 | 32 | This experiment uses the following components: 33 | 34 | - **MySQL RDS Instance**: The target database that will experience CPU load and failover 35 | - **EC2 Load Generator Instance**: Executes the SSM document to generate database load 36 | - **SSM Document**: Contains the load testing scripts that create CPU-intensive queries 37 | - **FIS Experiment**: Orchestrates the load generation and failover sequence 38 | 39 | **Critical**: The EC2 instance acts as the load generator and must be able to connect to your MySQL RDS instance. The SSM document will be executed on this instance, not directly on the RDS instance. 40 | 41 | ## EC2 Instance Setup 42 | 43 | Your EC2 instance must meet these requirements: 44 | 45 | 1. **Network Access**: Security groups must allow outbound connections to MySQL RDS on port 3306 46 | 2. **MySQL Client**: Install `mysql-client` or equivalent for database connectivity 47 | 3. **SSM Agent**: Ensure SSM Agent is installed and the instance appears in Systems Manager 48 | 4. **IAM Role**: Attach an IAM role with `AmazonSSMManagedInstanceCore` policy 49 | 5. 
**Tagging**: Tag the instance with `FIS-Ready=True` 50 | 51 | Test connectivity before running the experiment: 52 | ```bash 53 | mysql -h your-rds-endpoint -u your-username -p -e "SELECT 1;" 54 | ``` 55 | 56 | ## ⚠️ Database Impact Warning 57 | 58 | **IMPORTANT**: This experiment will create test tables in your MySQL database: 59 | 60 | ### Tables Created: 61 | - `loadtest` - Load testing table with auto-increment primary key 62 | - Test database (if `DBName` parameter specifies a non-existing database) 63 | 64 | ### Impact: 65 | - Tables will persist after the experiment completes 66 | - Test data will be inserted during load testing 67 | - No existing data will be modified or deleted 68 | - Tables use `IF NOT EXISTS` clauses to avoid conflicts 69 | 70 | ### Cleanup: 71 | If you need to remove the test table after the experiment, you can manually drop it: 72 | ```sql 73 | DROP TABLE IF EXISTS loadtest; 74 | ``` 75 | 76 | ## Stop Conditions 77 | 78 | The experiment does not have any specific stop conditions defined. It will continue to run until manually stopped or until all actions have been completed on the targeted resources. 79 | 80 | ## Observability and stop conditions 81 | 82 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or 83 | business metric requiring an immediate end of the fault injection. This 84 | template makes no assumptions about your application and the relevant metrics 85 | and does not include stop conditions by default. 86 | 87 | ## Next Steps 88 | 89 | As you adapt this scenario to your needs, we recommend: 90 | 91 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 92 | 2. Identifying business metrics tied to your MySQL RDS instance performance. 93 | 3. Creating an Amazon CloudWatch metric and Amazon CloudWatch alarm to monitor the impact of high CPU load and failover. 94 | 4. Adding a stop condition tied to the alarm to automatically halt the experiment if critical thresholds are breached. 95 | 5. Customizing the SSM document parameters to adjust load test concurrency, duration, and target CPU utilization. 96 | 6. Testing the load generation script independently before running the full FIS experiment. 97 | 98 | ## Import Experiment 99 | 100 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 101 | 102 | ## Monitoring Recommendations 103 | 104 | For optimal experiment observability, consider monitoring these key metrics during execution: 105 | - RDS CPU Utilization (target: sustained high load before failover) 106 | - RDS Database Connections (monitor connection drops during failover) 107 | - Application response times and error rates 108 | - RDS Failover completion time via CloudWatch Events 109 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: Aurora Cluster CPU Overload and Failover 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS! 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Description 8 | 9 | This experiment simulates CPU overload on an Aurora PostgreSQL cluster and then initiates a failover to test the resilience of your database infrastructure under stress conditions. 10 | 11 | ## Hypothesis 12 | 13 | When high CPU load occurs on an Aurora cluster followed by a subsequent failover, the system will restore normal operation with minimal disruption, and the application's functionality will remain largely unaffected. The automatic recovery process will complete within minutes, and the system's request processing capability will maintain continuity at near 100% efficiency after the failover completes. 14 | 15 | ## Prerequisites 16 | 17 | Before running this experiment, ensure that: 18 | 19 | 1. You have an Aurora PostgreSQL cluster tagged with `FIS-Ready=True` 20 | 2. **You have an EC2 instance tagged with `FIS-Ready=True` that serves as the load generator** 21 | - This instance must have network connectivity to your Aurora cluster 22 | - The instance must have the SSM Agent installed and running 23 | - The instance requires appropriate IAM permissions to execute SSM documents 24 | - The instance will execute CPU-intensive database queries against the Aurora cluster 25 | 3. The Aurora cluster is configured for Multi-AZ deployment with writer and reader instances 26 | 4. You have created the required IAM role with the provided policy document 27 | 5. You have deployed the SSM document for load testing 28 | 6. You have configured appropriate CloudWatch monitoring and alarms 29 | 30 | ## Architecture Overview 31 | 32 | This experiment uses the following components: 33 | 34 | - **Aurora PostgreSQL Cluster**: The target database that will experience CPU load and failover 35 | - **EC2 Load Generator Instance**: Executes the SSM document to generate database load 36 | - **SSM Document**: Contains the load testing scripts that create CPU-intensive queries 37 | - **FIS Experiment**: Orchestrates the load generation and failover sequence 38 | 39 | **Critical**: The EC2 instance acts as the load generator and must be able to connect to your Aurora cluster. The SSM document will be executed on this instance, not directly on the Aurora cluster. 40 | 41 | ## EC2 Instance Setup 42 | 43 | Your EC2 instance must meet these requirements: 44 | 45 | 1. **Network Access**: Security groups must allow outbound connections to Aurora on port 5432 46 | 2. **PostgreSQL Client**: Install `postgresql-client` or equivalent for database connectivity 47 | 3. **SSM Agent**: Ensure SSM Agent is installed and the instance appears in Systems Manager 48 | 4. **IAM Role**: Attach an IAM role with `AmazonSSMManagedInstanceCore` policy 49 | 5. 
**Tagging**: Tag the instance with `FIS-Ready=True` 50 | 51 | Test connectivity before running the experiment: 52 | ```bash 53 | psql -h your-aurora-endpoint -U your-username -d your-database -c "SELECT 1;" 54 | ``` 55 | 56 | ## ⚠️ Database Impact Warning 57 | 58 | **IMPORTANT**: This experiment will create test tables in your Aurora PostgreSQL database: 59 | 60 | ### Tables Created: 61 | - `load_test_users` - User records with status and timestamps 62 | - `load_test_transactions` - Transaction records with foreign key relationships 63 | 64 | ### Impact: 65 | - Tables will persist after the experiment completes 66 | - Test data will be inserted during load testing 67 | - No existing data will be modified or deleted 68 | - Tables use `IF NOT EXISTS` clauses to avoid conflicts 69 | - Indexes will be created for performance testing 70 | 71 | ### Cleanup: 72 | If you need to remove the test tables after the experiment, you can manually drop them: 73 | ```sql 74 | DROP TABLE IF EXISTS load_test_transactions; 75 | DROP TABLE IF EXISTS load_test_users; 76 | ``` 77 | 78 | ## How it works 79 | 80 | This experiment simulates a high CPU load scenario followed by an Aurora DB cluster failover: 81 | 82 | 1. **Baseline establishment**: 5-minute delay to establish baseline metrics 83 | 2. **CPU load generation**: SSM document executes CPU-intensive queries on the Aurora cluster 84 | 3. **Failover initiation**: After the delay, promotes an Aurora Replica to be the primary writer 85 | 4. **Impact observation**: Load test continues to observe failover impact on performance 86 | 87 | The experiment targets resources with the `FIS-Ready=True` tag for safety and control. 88 | 89 | ## Observability and stop conditions 90 | 91 | This template does not include stop conditions by default. You should add CloudWatch alarms based on your specific operational metrics to automatically halt the experiment if critical thresholds are breached. 92 | 93 | ## Files included 94 | 95 | - `aurora-postgres-cluster-loadtest-failover-template.json`: Main FIS experiment template 96 | - `aurora-postgres-cluster-loadtest-failover-iam-policy.json`: Required IAM permissions 97 | - `aurora-postgres-cluster-loadtest-failover-ssm-template.json`: SSM document for load testing 98 | - `fis-iam-trust-relationship.json`: IAM trust policy for FIS service 99 | 100 | ## Next steps 101 | 102 | 1. Review and customize the experiment parameters for your environment 103 | 2. Set up CloudWatch monitoring and create appropriate alarms 104 | 3. Add stop conditions based on your operational metrics 105 | 4. Test the experiment in a non-production environment first 106 | 5. Create a CloudWatch dashboard to visualize experiment effects 107 | 108 | ## Import experiment 109 | 110 | You can import the JSON experiment template into your AWS account via CLI or AWS CDK. For step-by-step instructions, see the [fis-template-library-tooling](https://github.com/aws-samples/fis-template-library-tooling) repository. 111 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/README.md: -------------------------------------------------------------------------------- 1 | # AWS Fault Injection Service Experiment: DynamoDB Region Impairment 2 | 3 | This is an experiment template for use with AWS Fault Injection Service (FIS) and fis-template-library-tooling. This experiment template requires deployment into your AWS account and requires resources in your AWS account to inject faults into. 4 | 5 | THIS TEMPLATE WILL INJECT REAL FAULTS!
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 6 | 7 | ## Example Hypothesis 8 | 9 | When DynamoDB experiences a complete regional failure in us-east-1 which impacts my global table application, an alarm should be raised and the DevOps team notified within 5 minutes. The application should automatically failover to the us-west-2 replica within 2 minutes. During the impairment, all read and write operations should be redirected to the healthy region. Once the regional failure is resolved, the application should resume normal cross-region operation within 5 minutes. 10 | 11 | ### What does this enable me to verify? 12 | 13 | * Appropriate customer experience metrics and observability of DynamoDB global tables is in place (were you able to detect there was a problem?) 14 | * Alarms are configured correctly (were the right people notified and/or automations triggered?) 15 | * Your app gracefully fails over to the healthy region and customers can continue using the application 16 | * Your circuit breaker (if any) works as expected for regional failures 17 | * Recovery controls (if any) work as expected when the region comes back online 18 | * Cross-region replication monitoring and alerting functions correctly 19 | 20 | ## Prerequisites 21 | 22 | Before running this experiment, ensure that: 23 | 24 | 1. You have the roles created for FIS and SSM Automation to use. Example IAM policy documents and trust policies are provided. 25 | 2. You have created the SSM Automation Document from the sample provided (dynamodb-region-impairment-automation.yaml) 26 | 3. You have created the FIS Experiment Template from the sample provided (dynamodb-region-impairment-experiment-template.json) 27 | 4. **Update all region references** in the template files to match your target region (currently set to us-east-1 as an example) 28 | 5. **Update table names** in the template files to match your DynamoDB global table names 29 | 6. The DynamoDB global table(s) you want to target have the "FIS-Ready":"True" tag and value 30 | 7. You have appropriate monitoring and observability in place to track the impact of the experiment. 31 | 32 | ## How it works 33 | 34 | This experiment simulates a complete regional DynamoDB failure by combining two complementary actions: 35 | 36 | ### Timeline 37 | - **T+0**: Both actions start simultaneously 38 | - **T+10s**: SSM automation applies application-blocking policy (after FIS policy is established) 39 | - **T+10m**: SSM automation completes and cleans up its policy statements 40 | - **T+12m**: FIS built-in action completes and auto-expires its policy statements 41 | 42 | **Note**: The durations can be modified to fit your testing needs, but the staggered timing should be maintained to prevent race conditions and ensure proper cleanup sequencing. 43 | 44 | ### Actions 45 | 46 | **1. 
Native FIS Action (aws:dynamodb:global-table-pause-replication)** 47 | - Blocks DynamoDB replication service from synchronizing data between regions 48 | - Duration: 12 minutes 49 | - Uses time-based auto-expiring resource policy statements 50 | - Automatically cleans up when experiment completes 51 | 52 | **2. Custom SSM Automation (blockDynamoDBAccess)** 53 | - Blocks all application access (reads/writes) to the table in the target region 54 | - Duration: 10 minutes with 10-second initial delay to avoid race conditions 55 | - Uses resource policy with role exclusions for FIS, SSM, and DynamoDB service roles 56 | - Includes proper cleanup logic to remove only its policy statements 57 | 58 | ### Race Condition Prevention 59 | Both actions start simultaneously but modify the same DynamoDB resource policy. A 10-second sleep was added at the start of the SSM automation document to prevent race conditions - allowing the built-in FIS action to successfully apply its policy first. 60 | 61 | ### Recovery Window Testing 62 | This creates a 2-minute recovery window (minutes 10-12) where application access is restored but replication remains paused, allowing testing of partial recovery scenarios and cross-region failover behavior. 63 | 64 | To verify the experiment is setup and working properly, you can use the AWS CLI to attempt operations on a targeted DynamoDB table: 65 | 66 | ```bash 67 | # Test application access (should fail during impairment) 68 | watch -n 5 'aws dynamodb put-item --table-name my-global-table --item "{\"id\":{\"S\":\"test-$(date +%s)\"},\"message\":{\"S\":\"test message\"}}" --region us-east-1 --no-cli-pager' 69 | 70 | # Test reads (should also fail during impairment) 71 | watch -n 5 'aws dynamodb get-item --table-name my-global-table --key "{\"id\":{\"S\":\"test-item\"}}" --region us-east-1 --no-cli-pager' 72 | 73 | # Test failover region (should continue working) 74 | watch -n 5 'aws dynamodb put-item --table-name my-global-table --item "{\"id\":{\"S\":\"test-$(date +%s)\"},\"message\":{\"S\":\"test message\"}}" --region us-west-2 --no-cli-pager' 75 | ``` 76 | 77 | During the impairment periods, you should see "AccessDenied" errors when attempting operations on the us-east-1 table, while us-west-2 operations continue normally. 78 | 79 | ## Stop Conditions 80 | 81 | The experiment does not have any specific stop conditions defined. It will continue to run until all actions are completed or until manually stopped. 82 | 83 | ## Observability and stop conditions 84 | 85 | Stop conditions are based on an AWS CloudWatch alarm based on an operational or business metric requiring an immediate end of the fault injection. This template makes no assumptions about your application and the relevant metrics and does not include stop conditions by default. 86 | 87 | ## Next Steps 88 | 89 | As you adapt this scenario to your needs, we recommend: 90 | 91 | 1. Reviewing the tag names you use to ensure they fit your specific use case. 92 | 2. Identifying business metrics tied to your DynamoDB global table operations, such as application transaction rates and cross-region latency. 93 | 3. Creating Amazon CloudWatch metrics and alarms to monitor: 94 | - Application error rates during regional failures 95 | - Cross-region failover time 96 | - Data consistency after recovery 97 | - Replication lag between regions 98 | 4. Adding stop conditions tied to critical business metrics to automatically halt the experiment if unacceptable impact occurs. 99 | 5. 
Implementing appropriate circuit breakers in your application to handle regional DynamoDB failures gracefully. 100 | 6. Testing your application's regional failover mechanisms to ensure they work as expected. 101 | 7. Validating that your monitoring can distinguish between planned chaos experiments and real outages. 102 | 8. Documenting the findings from your experiment and updating your incident response procedures accordingly. 103 | 9. Testing recovery procedures to ensure applications properly resume cross-region operations after the experiment. 104 | 105 | ## Import Experiment 106 | 107 | You can import the json experiment template into your AWS account via cli or aws cdk. For step by step instructions on how, [click here](https://github.com/aws-samples/fis-template-library-tooling). 108 | -------------------------------------------------------------------------------- /sqs-queue-impairment/sqs-queue-impairment-tag-based-automation.yaml: -------------------------------------------------------------------------------- 1 | description: "Apply a deny-all policy to SQS queues with specific tag to simulate impairment using SQS:AddPermission and SQS:RemovePermission" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tagKey: 6 | type: String 7 | description: "Tag key to identify SQS queues to impair" 8 | default: "FIS-Ready" 9 | tagValue: 10 | type: String 11 | description: "Tag value to identify SQS queues to impair" 12 | default: "True" 13 | duration: 14 | type: String 15 | description: "Duration of the impairment in ISO8601 format" 16 | default: "PT10M" 17 | region: 18 | type: String 19 | description: "AWS Region of the SQS queues" 20 | default: "{{global:REGION}}" 21 | AutomationAssumeRole: 22 | type: String 23 | description: "IAM role for the automation execution" 24 | default: "" 25 | 26 | mainSteps: 27 | - name: getTargetQueues 28 | action: aws:executeScript 29 | inputs: 30 | Runtime: python3.11 31 | Handler: get_queues 32 | Script: | 33 | import boto3 34 | 35 | def get_queues(events, context): 36 | region = events['region'] 37 | tag_key = events['tagKey'] 38 | tag_value = events['tagValue'] 39 | 40 | sqs = boto3.client('sqs', region_name=region) 41 | target_queues = [] 42 | 43 | # Find queues by tag 44 | response = sqs.list_queues() 45 | if 'QueueUrls' not in response: 46 | return [] 47 | 48 | for queue_url in response['QueueUrls']: 49 | try: 50 | tags_response = sqs.list_queue_tags(QueueUrl=queue_url) 51 | tags = tags_response.get('Tags', {}) 52 | 53 | if tags.get(tag_key) == tag_value: 54 | target_queues.append(queue_url) 55 | except Exception as e: 56 | print(f"Error getting tags for queue {queue_url}: {str(e)}") 57 | continue 58 | 59 | return target_queues 60 | InputPayload: 61 | region: "{{ region }}" 62 | tagKey: "{{ tagKey }}" 63 | tagValue: "{{ tagValue }}" 64 | outputs: 65 | - Name: targetQueues 66 | Selector: $.Payload 67 | Type: StringList 68 | description: "Find all SQS queues with the specified tag or use the provided queue URL" 69 | 70 | - name: applyDenyAllPolicyToQueues 71 | action: aws:executeScript 72 | onFailure: "step:removeDenyAllPolicyFromQueues" 73 | onCancel: "step:removeDenyAllPolicyFromQueues" 74 | inputs: 75 | Runtime: python3.11 76 | Handler: apply_deny_policy 77 | Script: | 78 | import boto3 79 | import json 80 | 81 | def apply_deny_policy(events, context): 82 | region = events['region'] 83 | target_queues = events['targetQueues'] 84 | 85 | sqs = boto3.client('sqs', region_name=region) 86 | results = [] 87 | 88 | for 
queue_url in target_queues: 89 | try: 90 | # Get existing policy 91 | response = sqs.get_queue_attributes( 92 | QueueUrl=queue_url, 93 | AttributeNames=['Policy'] 94 | ) 95 | 96 | existing_policy = {} 97 | if 'Policy' in response.get('Attributes', {}): 98 | existing_policy = json.loads(response['Attributes']['Policy']) 99 | else: 100 | existing_policy = { 101 | "Version": "2012-10-17", 102 | "Statement": [] 103 | } 104 | 105 | # Add deny statement 106 | deny_statement = { 107 | "Effect": "Deny", 108 | "Principal": "*", 109 | "Action": [ 110 | "sqs:DeleteMessage", 111 | "sqs:ChangeMessageVisibility", 112 | "sqs:PurgeQueue", 113 | "sqs:ReceiveMessage", 114 | "sqs:SendMessage" 115 | ], 116 | "Resource": "*", 117 | "Sid": "FISTemporaryDeny" 118 | } 119 | 120 | # Remove any existing statement with the same Sid to avoid duplicates 121 | existing_policy['Statement'] = [s for s in existing_policy.get('Statement', []) 122 | if s.get('Sid') != 'FISTemporaryDeny'] 123 | 124 | existing_policy['Statement'].append(deny_statement) 125 | sqs.set_queue_attributes( 126 | QueueUrl=queue_url, 127 | Attributes={ 128 | 'Policy': json.dumps(existing_policy) 129 | } 130 | ) 131 | 132 | results.append(f"Successfully applied deny policy to {queue_url}") 133 | except Exception as e: 134 | raise RuntimeError(f"Failed to apply SQS policy to {queue_url}: {e}") from e 135 | 136 | return { 137 | 'affectedQueues': target_queues, 138 | 'results': results 139 | } 140 | InputPayload: 141 | region: "{{ region }}" 142 | targetQueues: "{{ getTargetQueues.targetQueues }}" 143 | outputs: 144 | - Name: affectedQueues 145 | Selector: $.Payload.affectedQueues 146 | Type: StringList 147 | - Name: results 148 | Selector: $.Payload.results 149 | Type: StringList 150 | description: "Apply deny-all policy to all target SQS queues" 151 | 152 | - name: waitForDuration 153 | action: "aws:sleep" 154 | onFailure: "step:removeDenyAllPolicyFromQueues" 155 | onCancel: "step:removeDenyAllPolicyFromQueues" 156 | inputs: 157 | Duration: "{{ duration }}" 158 | description: "Wait for the specified duration while the SQS queues are impaired" 159 | 160 | - name: removeDenyAllPolicyFromQueues 161 | action: aws:executeScript 162 | inputs: 163 | Runtime: python3.11 164 | Handler: remove_deny_policy 165 | Script: | 166 | import boto3 167 | 168 | def remove_deny_policy(events, context): 169 | region = events['region'] 170 | affected_queues = events['affectedQueues'] 171 | 172 | sqs = boto3.client('sqs', region_name=region) 173 | results = [] 174 | 175 | for queue_url in affected_queues: 176 | try: 177 | sqs.remove_permission( 178 | QueueUrl=queue_url, 179 | Label='FISTemporaryDeny' 180 | ) 181 | results.append(f"Successfully removed deny policy from {queue_url}") 182 | except Exception as e: 183 | results.append(f"Failed to remove deny policy from {queue_url}: {str(e)}") 184 | 185 | return results 186 | InputPayload: 187 | region: "{{ region }}" 188 | affectedQueues: "{{ applyDenyAllPolicyToQueues.affectedQueues }}" 189 | outputs: 190 | - Name: results 191 | Selector: $.Payload 192 | Type: StringList 193 | description: "Remove the deny permission from all affected SQS queues to restore normal operation" 194 | isEnd: true 195 | -------------------------------------------------------------------------------- /aurora-postgres-cluster-loadtest-failover/aurora-postgres-cluster-loadtest-failover-ssm-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2.2", 3 | "description": "Run database 
load test on Aurora PostgreSQL cluster", 4 | "parameters": { 5 | "DBHost": { 6 | "type": "String", 7 | "description": "Database host endpoint", 8 | "default": "{{resolve:ssm:/aurora-cluster/endpoint}}" 9 | }, 10 | "DBPort": { 11 | "type": "String", 12 | "description": "Database port", 13 | "default": "5432" 14 | }, 15 | "DBName": { 16 | "type": "String", 17 | "description": "Database name", 18 | "default": "mydb" 19 | }, 20 | "DBUser": { 21 | "type": "String", 22 | "description": "Database username", 23 | "default": "postgres" 24 | }, 25 | "DBPassword": { 26 | "type": "String", 27 | "description": "Database password", 28 | "default": "{{resolve:secretsmanager:aurora-cluster-password:SecretString:password}}" 29 | }, 30 | "NumRecords": { 31 | "type": "String", 32 | "description": "Number of records to generate", 33 | "default": "10000" 34 | }, 35 | "Concurrency": { 36 | "type": "String", 37 | "description": "Concurrency level for the load test", 38 | "default": "5" 39 | }, 40 | "Duration": { 41 | "type": "String", 42 | "description": "Duration of the load test in seconds", 43 | "default": "600" 44 | } 45 | }, 46 | "mainSteps": [ 47 | { 48 | "action": "aws:runShellScript", 49 | "name": "installDependencies", 50 | "inputs": { 51 | "runCommand": [ 52 | "#!/bin/bash", 53 | "# Install PostgreSQL client with OS detection", 54 | "if [ -f \"/etc/system-release\" ] && grep -i 'Amazon Linux' /etc/system-release; then", 55 | " if ! grep -Fiq 'VERSION_ID=\"2023\"' /etc/os-release; then", 56 | " # Amazon Linux 2 or earlier", 57 | " sudo yum install -y postgresql", 58 | " elif grep -Fiq 'ID=\"amzn\"' /etc/os-release && grep -Fiq 'VERSION_ID=\"2023\"' /etc/os-release; then", 59 | " # Amazon Linux 2023", 60 | " sudo yum install -y postgresql15", 61 | " fi", 62 | "elif grep -Fiq 'ID=\"centos\"' /etc/os-release || grep -Fiq 'ID=\"rhel\"' /etc/os-release; then", 63 | " # CentOS/RHEL", 64 | " sudo yum install -y postgresql", 65 | "elif grep -Fiq 'ID=ubuntu' /etc/os-release || grep -Fiq 'ID=debian' /etc/os-release; then", 66 | " # Ubuntu/Debian", 67 | " sudo apt-get update && sudo apt-get install -y postgresql-client", 68 | "else", 69 | " echo \"Unsupported OS. Please install PostgreSQL client manually.\"", 70 | " exit 1", 71 | "fi" 72 | ] 73 | } 74 | }, 75 | { 76 | "action": "aws:runShellScript", 77 | "name": "runLoadTest", 78 | "inputs": { 79 | "timeoutSeconds": "900", 80 | "runCommand": [ 81 | "#!/bin/bash", 82 | "# Configuration", 83 | "DB_HOST=\"{{ DBHost }}\"", 84 | "DB_PORT=\"{{ DBPort }}\"", 85 | "DB_NAME=\"{{ DBName }}\"", 86 | "DB_USER=\"{{ DBUser }}\"", 87 | "DB_PASSWORD=\"{{ DBPassword }}\"", 88 | "NUM_RECORDS=\"{{ NumRecords }}\"", 89 | "CONCURRENCY=\"{{ Concurrency }}\"", 90 | "DURATION=\"{{ Duration }}\"", 91 | "", 92 | "# Function to execute SQL", 93 | "execute_sql() {", 94 | " PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c \"$1\"", 95 | "}", 96 | "", 97 | "# Function to execute SQL and return result", 98 | "execute_sql_return() {", 99 | " PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -t -c \"$1\" | tr -d '[:space:]'", 100 | "}", 101 | "", 102 | "# Check connection", 103 | "echo \"Checking connection to PostgreSQL...\"", 104 | "if ! execute_sql \"\\conninfo\"; then", 105 | " echo \"Failed to connect to PostgreSQL. 
Please check your connection parameters.\"", 106 | " exit 1", 107 | "fi", 108 | "", 109 | "# Create test tables if they don't exist", 110 | "echo \"Setting up test tables...\"", 111 | "execute_sql \"", 112 | "CREATE TABLE IF NOT EXISTS load_test_users (", 113 | " id SERIAL PRIMARY KEY,", 114 | " username VARCHAR(50) NOT NULL,", 115 | " email VARCHAR(100) NOT NULL,", 116 | " created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,", 117 | " status VARCHAR(20) DEFAULT 'active',", 118 | " login_count INTEGER DEFAULT 0", 119 | ");", 120 | "", 121 | "CREATE INDEX IF NOT EXISTS idx_load_test_users_username ON load_test_users(username);", 122 | "CREATE INDEX IF NOT EXISTS idx_load_test_users_email ON load_test_users(email);", 123 | "CREATE INDEX IF NOT EXISTS idx_load_test_users_status ON load_test_users(status);", 124 | "", 125 | "CREATE TABLE IF NOT EXISTS load_test_transactions (", 126 | " id SERIAL PRIMARY KEY,", 127 | " user_id INTEGER REFERENCES load_test_users(id),", 128 | " amount DECIMAL(10,2) NOT NULL,", 129 | " transaction_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,", 130 | " description TEXT,", 131 | " category VARCHAR(50)", 132 | ");", 133 | "", 134 | "CREATE INDEX IF NOT EXISTS idx_load_test_transactions_user_id ON load_test_transactions(user_id);", 135 | "CREATE INDEX IF NOT EXISTS idx_load_test_transactions_date ON load_test_transactions(transaction_date);", 136 | "CREATE INDEX IF NOT EXISTS idx_load_test_transactions_category ON load_test_transactions(category);", 137 | "\"", 138 | "", 139 | "# Function to run CPU-intensive queries", 140 | "run_cpu_intensive_query() {", 141 | " local query=\"", 142 | " WITH RECURSIVE cpu_load AS (", 143 | " SELECT 1 as n, random() as r", 144 | " UNION ALL", 145 | " SELECT n + 1, random() * r", 146 | " FROM cpu_load", 147 | " WHERE n < 1000", 148 | " ),", 149 | " complex_aggregation AS (", 150 | " SELECT ", 151 | " u.id,", 152 | " u.username,", 153 | " COUNT(t.id) * SUM(t.amount) / NULLIF(AVG(t.amount), 0) as complex_metric,", 154 | " STDDEV(t.amount) as amount_stddev,", 155 | " PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY t.amount) as median_amount", 156 | " FROM load_test_users u", 157 | " JOIN load_test_transactions t ON u.id = t.user_id", 158 | " GROUP BY u.id, u.username", 159 | " )", 160 | " SELECT ca.*, cl.r", 161 | " FROM complex_aggregation ca", 162 | " CROSS JOIN cpu_load cl", 163 | " ORDER BY ca.complex_metric DESC, cl.r", 164 | " LIMIT 100;", 165 | " \"", 166 | " while true; do", 167 | " execute_sql \"$query\" > /dev/null 2>&1", 168 | " sleep 0.1", 169 | " done", 170 | "}", 171 | "", 172 | "# Start CPU-intensive load test", 173 | "echo \"Starting CPU-intensive load test with concurrency $CONCURRENCY for $DURATION seconds...\"", 174 | "", 175 | "# Start worker processes", 176 | "pids=()", 177 | "for i in $(seq 1 $CONCURRENCY); do", 178 | " run_cpu_intensive_query &", 179 | " pids+=($!)", 180 | "done", 181 | "", 182 | "echo \"Load test is running with ${#pids[@]} worker processes.\"", 183 | "", 184 | "# Sleep for the specified duration", 185 | "sleep $DURATION", 186 | "", 187 | "# Kill all worker processes", 188 | "echo \"Stopping load test...\"", 189 | "for pid in \"${pids[@]}\"; do", 190 | " kill -9 $pid 2>/dev/null", 191 | "done", 192 | "", 193 | "echo \"CPU load test completed.\"" 194 | ] 195 | } 196 | } 197 | ] 198 | } 199 | -------------------------------------------------------------------------------- /elasticache-redis-primary-node-reboot/elasticache-redis-primary-node-reboot-automation.yaml: 
-------------------------------------------------------------------------------- 1 | description: "Simulate ElastiCache Redis primary node reboot to test application resilience" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tagKey: 6 | type: String 7 | description: "Tag key to identify ElastiCache clusters to target" 8 | default: "FIS-Ready" 9 | tagValue: 10 | type: String 11 | description: "Tag value to identify ElastiCache clusters to target" 12 | default: "True" 13 | region: 14 | type: String 15 | description: "AWS Region of the ElastiCache clusters" 16 | default: "{{global:REGION}}" 17 | AutomationAssumeRole: 18 | type: String 19 | description: "IAM role for the automation execution" 20 | default: "" 21 | 22 | mainSteps: 23 | - name: triggerNodeFailover 24 | action: aws:executeScript 25 | inputs: 26 | Runtime: python3.11 27 | Handler: trigger_failover 28 | Script: | 29 | import boto3 30 | 31 | def trigger_failover(events, context): 32 | region = events["region"] 33 | tag_key = events["tagKey"] 34 | tag_value = events["tagValue"] 35 | 36 | elasticache = boto3.client("elasticache", region_name=region) 37 | results = [] 38 | 39 | # Get replication groups 40 | response = elasticache.describe_replication_groups() 41 | 42 | for rg in response["ReplicationGroups"]: 43 | if rg["Status"] == "available" and rg.get("AutomaticFailover") == "enabled": 44 | rg_id = rg["ReplicationGroupId"] 45 | 46 | try: 47 | # Check tags 48 | account_id = boto3.client("sts").get_caller_identity()["Account"] 49 | arn = "arn:aws:elasticache:{}:{}:replicationgroup:{}".format(region, account_id, rg_id) 50 | tags_response = elasticache.list_tags_for_resource(ResourceName=arn) 51 | tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagList", [])} 52 | 53 | if tags.get(tag_key) == tag_value: 54 | # Find primary cluster 55 | primary_cluster_id = None 56 | for node_group in rg["NodeGroups"]: 57 | for member in node_group["NodeGroupMembers"]: 58 | if member["CurrentRole"] == "primary": 59 | primary_cluster_id = member["CacheClusterId"] 60 | break 61 | if primary_cluster_id: 62 | break 63 | 64 | if primary_cluster_id: 65 | # Reboot primary node 66 | elasticache.reboot_cache_cluster( 67 | CacheClusterId=primary_cluster_id, 68 | CacheNodeIdsToReboot=["0001"] 69 | ) 70 | results.append("SUCCESS: Triggered failover for {} (primary: {})".format(rg_id, primary_cluster_id)) 71 | else: 72 | results.append("ERROR: No primary found for {}".format(rg_id)) 73 | 74 | except Exception as e: 75 | results.append("ERROR: Failed to process {}: {}".format(rg_id, str(e))) 76 | 77 | return results 78 | InputPayload: 79 | region: "{{ region }}" 80 | tagKey: "{{ tagKey }}" 81 | tagValue: "{{ tagValue }}" 82 | outputs: 83 | - Name: results 84 | Selector: $.Payload 85 | Type: StringList 86 | description: "Trigger node failover by rebooting primary nodes" 87 | 88 | - name: monitorPrimaryNodeRecovery 89 | action: aws:executeScript 90 | inputs: 91 | Runtime: python3.11 92 | Handler: monitor_node 93 | Script: | 94 | import boto3 95 | import time 96 | 97 | def monitor_node(events, context): 98 | region = events["region"] 99 | tag_key = events["tagKey"] 100 | tag_value = events["tagValue"] 101 | 102 | elasticache = boto3.client("elasticache", region_name=region) 103 | results = [] 104 | start_time = time.time() 105 | 106 | # Find the primary node that was rebooted 107 | response = elasticache.describe_replication_groups() 108 | primary_cluster_id = None 109 | 110 | for rg in 
response["ReplicationGroups"]: 111 | if rg["Status"] == "available" and rg.get("AutomaticFailover") == "enabled": 112 | rg_id = rg["ReplicationGroupId"] 113 | 114 | try: 115 | # Check tags 116 | account_id = boto3.client("sts").get_caller_identity()["Account"] 117 | arn = "arn:aws:elasticache:{}:{}:replicationgroup:{}".format(region, account_id, rg_id) 118 | tags_response = elasticache.list_tags_for_resource(ResourceName=arn) 119 | tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagList", [])} 120 | 121 | if tags.get(tag_key) == tag_value: 122 | # Find primary cluster 123 | for node_group in rg["NodeGroups"]: 124 | for member in node_group["NodeGroupMembers"]: 125 | if member["CurrentRole"] == "primary": 126 | primary_cluster_id = member["CacheClusterId"] 127 | break 128 | if primary_cluster_id: 129 | break 130 | except Exception as e: 131 | continue 132 | 133 | if not primary_cluster_id: 134 | return ["ERROR: Could not find primary node to monitor"] 135 | 136 | # Monitor the primary node status 137 | max_wait_time = 600 # 10 minutes 138 | check_interval = 10 # Check every 10 seconds 139 | 140 | for attempt in range(max_wait_time // check_interval): 141 | try: 142 | cluster_response = elasticache.describe_cache_clusters( 143 | CacheClusterId=primary_cluster_id, 144 | ShowCacheNodeInfo=True 145 | ) 146 | 147 | cluster = cluster_response["CacheClusters"][0] 148 | cluster_status = cluster["CacheClusterStatus"] 149 | 150 | if cluster_status == "available": 151 | recovery_time = time.time() - start_time 152 | results.append("SUCCESS: Primary node {} recovered to Available in {:.1f} seconds".format(primary_cluster_id, recovery_time)) 153 | break 154 | else: 155 | elapsed = time.time() - start_time 156 | results.append("MONITORING: Primary node {} status: {} after {:.1f}s".format(primary_cluster_id, cluster_status, elapsed)) 157 | time.sleep(check_interval) 158 | 159 | except Exception as e: 160 | results.append("ERROR: Failed to check primary node {}: {}".format(primary_cluster_id, str(e))) 161 | break 162 | else: 163 | results.append("TIMEOUT: Primary node {} did not recover within {} seconds".format(primary_cluster_id, max_wait_time)) 164 | 165 | return results 166 | InputPayload: 167 | region: "{{ region }}" 168 | tagKey: "{{ tagKey }}" 169 | tagValue: "{{ tagValue }}" 170 | outputs: 171 | - Name: results 172 | Selector: $.Payload 173 | Type: StringList 174 | description: "Monitor primary node recovery until Available status" 175 | isEnd: true 176 | -------------------------------------------------------------------------------- /elasticache-redis-connection-failure/redis-connection-failure-automation.yaml: -------------------------------------------------------------------------------- 1 | description: "Simulate Redis connection failure by modifying ElastiCache security groups" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tagKey: 6 | type: String 7 | description: "Tag key to identify ElastiCache clusters to target" 8 | default: "FIS-Ready" 9 | tagValue: 10 | type: String 11 | description: "Tag value to identify ElastiCache clusters to target" 12 | default: "True" 13 | duration: 14 | type: String 15 | description: "Duration of the connection failure in ISO8601 format" 16 | default: "PT5M" 17 | region: 18 | type: String 19 | description: "AWS Region of the ElastiCache clusters" 20 | default: "{{global:REGION}}" 21 | AutomationAssumeRole: 22 | type: String 23 | description: "IAM role for the automation execution" 24 | default: "" 25 | 
26 | mainSteps: 27 | - name: getTargetClusters 28 | action: aws:executeScript 29 | inputs: 30 | Runtime: python3.11 31 | Handler: get_clusters 32 | Script: | 33 | import boto3 34 | 35 | def get_clusters(events, context): 36 | region = events['region'] 37 | tag_key = events['tagKey'] 38 | tag_value = events['tagValue'] 39 | 40 | elasticache = boto3.client('elasticache', region_name=region) 41 | target_clusters = [] 42 | 43 | # Get Redis clusters 44 | paginator = elasticache.get_paginator('describe_cache_clusters') 45 | 46 | for page in paginator.paginate(): 47 | for cluster in page['CacheClusters']: 48 | if cluster['Engine'] == 'redis': 49 | cluster_id = cluster['CacheClusterId'] 50 | 51 | try: 52 | # Get cluster tags 53 | tags_response = elasticache.list_tags_for_resource( 54 | ResourceName=f"arn:aws:elasticache:{region}:{boto3.client('sts').get_caller_identity()['Account']}:cluster:{cluster_id}" 55 | ) 56 | 57 | tags = {tag['Key']: tag['Value'] for tag in tags_response.get('TagList', [])} 58 | 59 | if tags.get(tag_key) == tag_value: 60 | # Get security groups 61 | security_groups = cluster.get('SecurityGroups', []) 62 | if security_groups: 63 | target_clusters.append({ 64 | 'cluster_id': cluster_id, 65 | 'security_groups': [sg['SecurityGroupId'] for sg in security_groups] 66 | }) 67 | 68 | except Exception as e: 69 | print(f"Error processing cluster {cluster_id}: {str(e)}") 70 | continue 71 | 72 | return target_clusters 73 | InputPayload: 74 | region: "{{ region }}" 75 | tagKey: "{{ tagKey }}" 76 | tagValue: "{{ tagValue }}" 77 | outputs: 78 | - Name: targetClusters 79 | Selector: $.Payload 80 | Type: MapList 81 | description: "Find ElastiCache Redis clusters with specified tags" 82 | 83 | - name: disableRedisConnections 84 | action: aws:executeScript 85 | onFailure: "step:restoreRedisConnections" 86 | onCancel: "step:restoreRedisConnections" 87 | inputs: 88 | Runtime: python3.11 89 | Handler: disable_connections 90 | Script: | 91 | import boto3 92 | 93 | def disable_connections(events, context): 94 | region = events['region'] 95 | target_clusters = events['targetClusters'] 96 | 97 | ec2 = boto3.client('ec2', region_name=region) 98 | results = [] 99 | modified_rules = [] 100 | 101 | for cluster_info in target_clusters: 102 | cluster_id = cluster_info['cluster_id'] 103 | security_groups = cluster_info['security_groups'] 104 | 105 | for sg_id in security_groups: 106 | try: 107 | # Get current security group rules 108 | response = ec2.describe_security_groups(GroupIds=[sg_id]) 109 | sg = response['SecurityGroups'][0] 110 | 111 | # Store original inbound rules for Redis port (6379) 112 | redis_rules = [] 113 | for rule in sg['IpPermissions']: 114 | if rule.get('FromPort') == 6379 and rule.get('ToPort') == 6379: 115 | redis_rules.append(rule) 116 | 117 | if redis_rules: 118 | # Remove Redis access rules 119 | ec2.revoke_security_group_ingress( 120 | GroupId=sg_id, 121 | IpPermissions=redis_rules 122 | ) 123 | 124 | modified_rules.append({ 125 | 'security_group_id': sg_id, 126 | 'cluster_id': cluster_id, 127 | 'removed_rules': redis_rules 128 | }) 129 | 130 | results.append(f"Disabled Redis connections for cluster {cluster_id}, SG {sg_id}") 131 | 132 | except Exception as e: 133 | results.append(f"Failed to modify SG {sg_id} for cluster {cluster_id}: {str(e)}") 134 | 135 | return { 136 | 'modifiedRules': modified_rules, 137 | 'results': results 138 | } 139 | InputPayload: 140 | region: "{{ region }}" 141 | targetClusters: "{{ getTargetClusters.targetClusters }}" 142 | outputs: 143 | - Name: 
modifiedRules 144 | Selector: $.Payload.modifiedRules 145 | Type: MapList 146 | - Name: results 147 | Selector: $.Payload.results 148 | Type: StringList 149 | description: "Disable Redis connections by removing security group rules" 150 | 151 | - name: waitForDuration 152 | action: "aws:sleep" 153 | onFailure: "step:restoreRedisConnections" 154 | onCancel: "step:restoreRedisConnections" 155 | inputs: 156 | Duration: "{{ duration }}" 157 | description: "Wait for the specified duration while Redis connections are blocked" 158 | 159 | - name: restoreRedisConnections 160 | action: aws:executeScript 161 | inputs: 162 | Runtime: python3.11 163 | Handler: restore_connections 164 | Script: | 165 | import boto3 166 | 167 | def restore_connections(events, context): 168 | region = events['region'] 169 | modified_rules = events['modifiedRules'] 170 | 171 | ec2 = boto3.client('ec2', region_name=region) 172 | results = [] 173 | 174 | for rule_info in modified_rules: 175 | sg_id = rule_info['security_group_id'] 176 | cluster_id = rule_info['cluster_id'] 177 | removed_rules = rule_info['removed_rules'] 178 | 179 | try: 180 | # Restore the original rules 181 | if removed_rules: 182 | ec2.authorize_security_group_ingress( 183 | GroupId=sg_id, 184 | IpPermissions=removed_rules 185 | ) 186 | results.append(f"Restored Redis connections for cluster {cluster_id}, SG {sg_id}") 187 | 188 | except Exception as e: 189 | results.append(f"Failed to restore SG {sg_id} for cluster {cluster_id}: {str(e)}") 190 | 191 | return results 192 | InputPayload: 193 | region: "{{ region }}" 194 | modifiedRules: "{{ disableRedisConnections.modifiedRules }}" 195 | outputs: 196 | - Name: results 197 | Selector: $.Payload 198 | Type: StringList 199 | description: "Restore Redis connections by adding back security group rules" 200 | isEnd: true 201 | -------------------------------------------------------------------------------- /ec2-windows-stop-iis/ec2-windows-stop-iis-ssm-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2.2", 3 | "description": "Stop IIS Application Pool for FIS experiment", 4 | "parameters": { 5 | "DurationSeconds": { 6 | "type": "String", 7 | "default": "120", 8 | "description": "Duration of test in seconds.", 9 | "allowedPattern": "([1-9][0-9]{0,4})|(1[0-6][0-9]{4})|(17[0-1][0-9]{3})|(172[0-7][0-9]{2})|(172800)" 10 | }, 11 | "IISAppPoolName": { 12 | "type": "String", 13 | "default": "DefaultAppPool", 14 | "description": "Name of the Windows IIS Application Pool to Stop", 15 | "allowedPattern": "^[a-zA-Z0-9\\-_\\.]{1,50}$" 16 | } 17 | }, 18 | "mainSteps": [ 19 | { 20 | "action": "aws:runPowerShellScript", 21 | "name": "ValidatePrerequisites", 22 | "precondition": { 23 | "StringEquals": [ 24 | "platformType", 25 | "Windows" 26 | ] 27 | }, 28 | "inputs": { 29 | "timeoutSeconds": 60, 30 | "onFailure": "exit", 31 | "runCommand": [ 32 | "function Write-Log {", 33 | " param($Message)", 34 | " $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'", 35 | " Write-Output \"[$timestamp] $Message\"", 36 | "}", 37 | "", 38 | "try {", 39 | " # Check if IIS modules are installed", 40 | " Write-Log \"Checking if IIS modules are installed...\"", 41 | " $iisModule = Get-Module -ListAvailable -Name WebAdministration", 42 | " if (-not $iisModule) {", 43 | " Write-Log \"ERROR: IIS WebAdministration module is not installed\"", 44 | " Exit 1", 45 | " }", 46 | " Write-Log \"IIS WebAdministration module is installed\"", 47 | "", 48 | " # Import the WebAdministration 
module", 49 | " Import-Module WebAdministration", 50 | "", 51 | " # Check if experiment is already running", 52 | " if (Test-Path -Path 'C:\\temp\\fis_windows_iis_experiment.json') {", 53 | " Write-Log \"ERROR: fis_windows_iis_experiment.json already exists. Exiting.\"", 54 | " Exit 1", 55 | " }", 56 | "", 57 | " # Create temp directory if it doesn't exist", 58 | " if (-not (Test-Path -Path 'C:\\temp')) {", 59 | " Write-Log \"Creating C:\\temp directory\"", 60 | " New-Item -Path 'C:\\temp' -ItemType Directory -Force | Out-Null", 61 | " }", 62 | "", 63 | " # Verify IIS Application Pool exists", 64 | " Write-Log \"Verifying IIS Application Pool: {{IISAppPoolName}}\"", 65 | " $appPool = Get-IISAppPool -Name {{IISAppPoolName}} -ErrorAction SilentlyContinue", 66 | " if (-not $appPool) {", 67 | " Write-Log \"ERROR: Application Pool {{IISAppPoolName}} not found\"", 68 | " Exit 1", 69 | " }", 70 | "", 71 | " # Verify IIS Application Pool is in Running state", 72 | " Write-Log \"Checking if Application Pool {{IISAppPoolName}} is running...\"", 73 | " if ($appPool.State -ne \"Started\") {", 74 | " Write-Log \"ERROR: Application Pool {{IISAppPoolName}} is not in 'Started' state. Current state: $($appPool.State)\"", 75 | " Write-Log \"The experiment requires the application pool to be in 'Started' state to proceed.\"", 76 | " Exit 1", 77 | " }", 78 | " Write-Log \"Application Pool {{IISAppPoolName}} is in 'Started' state. Proceeding with experiment.\"", 79 | "", 80 | " # Store initial state for idempotency", 81 | " $initialState = @{", 82 | " 'AppPoolName' = '{{IISAppPoolName}}'", 83 | " 'InitialState' = $appPool.State", 84 | " 'StartTime' = (Get-Date).ToString('o')", 85 | " 'ExperimentDuration' = {{DurationSeconds}}", 86 | " }", 87 | " $initialState | ConvertTo-Json | Out-File -FilePath 'C:\\temp\\fis_windows_iis_experiment.json'", 88 | " Write-Log \"Prerequisites validated successfully\"", 89 | "}", 90 | "catch {", 91 | " Write-Log \"ERROR during validation: $($_.Exception.Message)\"", 92 | " Exit 1", 93 | "}" 94 | ] 95 | } 96 | }, 97 | { 98 | "action": "aws:runPowerShellScript", 99 | "name": "StopIISAppPool", 100 | "precondition": { 101 | "StringEquals": [ 102 | "platformType", 103 | "Windows" 104 | ] 105 | }, 106 | "inputs": { 107 | "timeoutSeconds": 120, 108 | "onFailure": "exit", 109 | "runCommand": [ 110 | "function Write-Log {", 111 | " param($Message)", 112 | " $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'", 113 | " Write-Output \"[$timestamp] $Message\"", 114 | "}", 115 | "", 116 | "try {", 117 | " # Import the WebAdministration module", 118 | " Import-Module WebAdministration", 119 | "", 120 | " # Load experiment data", 121 | " $experimentData = Get-Content -Path 'C:\\temp\\fis_windows_iis_experiment.json' | ConvertFrom-Json", 122 | " $start_time = Get-Date", 123 | "", 124 | " # Stop the application pool", 125 | " Write-Log \"Stopping IIS Application Pool: {{IISAppPoolName}}\"", 126 | " $appPool = Get-IISAppPool -Name {{IISAppPoolName}}", 127 | " $appPool | Stop-WebAppPool", 128 | "", 129 | " # Verify the app pool is stopped", 130 | " $stoppedPool = Get-IISAppPool -Name {{IISAppPoolName}}", 131 | " if ($stoppedPool.State -ne 'Stopped') {", 132 | " throw \"Failed to stop application pool\"", 133 | " }", 134 | " Write-Log \"Application Pool stopped successfully\"", 135 | "", 136 | " # Wait for the specified duration", 137 | " Write-Log \"Sleeping for {{DurationSeconds}} seconds\"", 138 | " Start-Sleep -Seconds {{DurationSeconds}}", 139 | "}", 140 | "catch {", 141 | " Write-Log \"ERROR 
during execution: $($_.Exception.Message)\"", 142 | " # Attempt to restore the app pool even if there was an error", 143 | " try {", 144 | " Write-Log \"Attempting to restore IIS Application Pool after error\"", 145 | " Start-WebAppPool -Name {{IISAppPoolName}}", 146 | " }", 147 | " catch {", 148 | " Write-Log \"ERROR during emergency restoration: $($_.Exception.Message)\"", 149 | " }", 150 | " Exit 1", 151 | "}" 152 | ] 153 | } 154 | }, 155 | { 156 | "action": "aws:runPowerShellScript", 157 | "name": "RestoreIISAppPool", 158 | "precondition": { 159 | "StringEquals": [ 160 | "platformType", 161 | "Windows" 162 | ] 163 | }, 164 | "inputs": { 165 | "timeoutSeconds": 120, 166 | "onFailure": "successAndExit", 167 | "runCommand": [ 168 | "function Write-Log {", 169 | " param($Message)", 170 | " $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'", 171 | " Write-Output \"[$timestamp] $Message\"", 172 | "}", 173 | "", 174 | "try {", 175 | " # Import the WebAdministration module", 176 | " Import-Module WebAdministration", 177 | "", 178 | " # Restore the application pool", 179 | " Write-Log \"Restoring IIS Application Pool: {{IISAppPoolName}}\"", 180 | " Start-WebAppPool -Name {{IISAppPoolName}}", 181 | "", 182 | " # Verify the app pool is started", 183 | " $startedPool = Get-IISAppPool -Name {{IISAppPoolName}}", 184 | " if ($startedPool.State -ne 'Started') {", 185 | " throw \"Failed to start application pool\"", 186 | " }", 187 | " Write-Log \"Application Pool restored successfully\"", 188 | "}", 189 | "catch {", 190 | " Write-Log \"ERROR during restoration: $($_.Exception.Message)\"", 191 | " throw", 192 | "}", 193 | "finally {", 194 | " # Cleanup - always remove the experiment file", 195 | " try {", 196 | " Write-Log \"Cleaning up: Deleting JSON file C:\\temp\\fis_windows_iis_experiment.json\"", 197 | " Remove-Item -Path C:\\temp\\fis_windows_iis_experiment.json -Force", 198 | " Write-Log \"JSON file deleted successfully\"", 199 | " }", 200 | " catch {", 201 | " Write-Log \"ERROR during cleanup: $($_.Exception.Message)\"", 202 | " }", 203 | "}" 204 | ] 205 | } 206 | } 207 | ] 208 | } 209 | -------------------------------------------------------------------------------- /dynamodb-region-impairment/dynamodb-region-impairment-automation.yaml: -------------------------------------------------------------------------------- 1 | description: "Block DynamoDB table access by modifying table resource policy" 2 | schemaVersion: "0.3" 3 | assumeRole: "{{ AutomationAssumeRole }}" 4 | parameters: 5 | tableName: 6 | type: String 7 | description: "DynamoDB table name to impair" 8 | default: "my-global-table" 9 | targetRegion: 10 | type: String 11 | description: "AWS Region to block DynamoDB access in" 12 | default: "us-east-1" 13 | duration: 14 | type: String 15 | description: "Duration of the impairment in ISO8601 format" 16 | default: "PT10M" 17 | AutomationAssumeRole: 18 | type: String 19 | description: "IAM role for the automation execution" 20 | default: "" 21 | 22 | mainSteps: 23 | - name: waitForFISAction 24 | action: aws:sleep 25 | inputs: 26 | Duration: PT10S 27 | 28 | - name: applyDenyPolicy 29 | action: aws:executeScript 30 | onFailure: step:cleanupPolicy 31 | onCancel: step:cleanupPolicy 32 | inputs: 33 | Runtime: python3.11 34 | Handler: apply_deny_policy 35 | Script: | 36 | import boto3 37 | import json 38 | import time 39 | 40 | def apply_deny_policy(events, context): 41 | table_name = events['tableName'] 42 | target_region = events['targetRegion'] 43 | 44 | # Create DynamoDB client for target 
region 45 | dynamodb = boto3.client('dynamodb', region_name=target_region) 46 | 47 | # Get current table description 48 | response = dynamodb.describe_table(TableName=table_name) 49 | table_arn = response['Table']['TableArn'] 50 | 51 | # Check for existing resource policy 52 | existing_policy = None 53 | try: 54 | policy_response = dynamodb.get_resource_policy(ResourceArn=table_arn) 55 | existing_policy = json.loads(policy_response['Policy']) 56 | print(f"Found existing policy on table {table_name}") 57 | except (dynamodb.exceptions.ResourceNotFoundException, dynamodb.exceptions.PolicyNotFoundException): 58 | print(f"No existing policy on table {table_name}") 59 | 60 | # Create deny statement 61 | deny_statement = { 62 | "Sid": "FISDenyAccess", 63 | "Effect": "Deny", 64 | "Principal": "*", 65 | "Action": [ 66 | "dynamodb:PutItem", 67 | "dynamodb:GetItem", 68 | "dynamodb:UpdateItem", 69 | "dynamodb:DeleteItem", 70 | "dynamodb:Query", 71 | "dynamodb:Scan", 72 | "dynamodb:BatchGetItem", 73 | "dynamodb:BatchWriteItem" 74 | ], 75 | "Resource": table_arn, 76 | "Condition": { 77 | "StringNotEquals": { 78 | "aws:PrincipalArn": [ 79 | "arn:aws:iam::*:role/*SSM*", 80 | "arn:aws:iam::*:role/*FIS*", 81 | "arn:aws:iam::*:role/aws-service-role/replication.dynamodb.amazonaws.com/AWSServiceRoleForDynamoDBReplication" 82 | ] 83 | } 84 | } 85 | } 86 | 87 | # Merge with existing policy or create new one 88 | if existing_policy: 89 | new_policy = existing_policy.copy() 90 | new_policy['Statement'].append(deny_statement) 91 | else: 92 | new_policy = { 93 | "Version": "2012-10-17", 94 | "Statement": [deny_statement] 95 | } 96 | 97 | 98 | try: 99 | # Apply the merged resource policy with retry logic 100 | max_retries = 5 101 | for attempt in range(max_retries): 102 | try: 103 | dynamodb.put_resource_policy( 104 | ResourceArn=table_arn, 105 | Policy=json.dumps(new_policy) 106 | ) 107 | break 108 | except dynamodb.exceptions.ResourceInUseException: 109 | if attempt < max_retries - 1: 110 | print(f"Table busy, retrying in {2**attempt} seconds...") 111 | time.sleep(2**attempt) 112 | else: 113 | raise 114 | 115 | print(f"Applied deny policy to table {table_name} in {target_region}") 116 | 117 | return { 118 | "statusCode": 200, 119 | "tableArn": table_arn, 120 | "targetRegion": target_region, 121 | "existingPolicy": existing_policy, 122 | "body": f"Successfully applied deny policy to {table_name}" 123 | } 124 | except Exception as e: 125 | print(f"Error applying policy: {str(e)}") 126 | raise e 127 | InputPayload: 128 | tableName: "{{ tableName }}" 129 | targetRegion: "{{ targetRegion }}" 130 | outputs: 131 | - Name: tableArn 132 | Selector: $.Payload.tableArn 133 | Type: String 134 | - Name: targetRegion 135 | Selector: $.Payload.targetRegion 136 | Type: String 137 | - Name: existingPolicy 138 | Selector: $.Payload.existingPolicy 139 | Type: StringMap 140 | 141 | - name: waitForDuration 142 | action: aws:sleep 143 | inputs: 144 | Duration: "{{ duration }}" 145 | 146 | - name: cleanupPolicy 147 | action: aws:executeScript 148 | inputs: 149 | Runtime: python3.11 150 | Handler: cleanup_policy 151 | Script: | 152 | import boto3 153 | import json 154 | 155 | def cleanup_policy(events, context): 156 | table_arn = events.get('tableArn') 157 | target_region = events.get('targetRegion') 158 | table_name = events.get('tableName') 159 | existing_policy = events.get('existingPolicy') 160 | 161 | # If we don't have tableArn from previous step, try to get it 162 | if not table_arn and table_name and target_region: 163 | dynamodb 
= boto3.client('dynamodb', region_name=target_region) 164 | response = dynamodb.describe_table(TableName=table_name) 165 | table_arn = response['Table']['TableArn'] 166 | 167 | if table_arn and target_region: 168 | dynamodb = boto3.client('dynamodb', region_name=target_region) 169 | try: 170 | # Get current policy to see what's there now 171 | current_policy_response = dynamodb.get_resource_policy(ResourceArn=table_arn) 172 | current_policy = json.loads(current_policy_response['Policy']) 173 | 174 | # Remove only our FISDenyAccess statement by SID 175 | filtered_statements = [ 176 | stmt for stmt in current_policy.get('Statement', []) 177 | if stmt.get('Sid') != 'FISDenyAccess' 178 | ] 179 | 180 | if filtered_statements: 181 | # Keep other statements, remove only ours 182 | cleaned_policy = { 183 | "Version": current_policy.get("Version", "2012-10-17"), 184 | "Statement": filtered_statements 185 | } 186 | dynamodb.put_resource_policy( 187 | ResourceArn=table_arn, 188 | Policy=json.dumps(cleaned_policy) 189 | ) 190 | print(f"Removed FISDenyAccess statement, preserved other policies on table {table_arn}") 191 | else: 192 | # No other statements, delete entire policy 193 | dynamodb.delete_resource_policy(ResourceArn=table_arn) 194 | print(f"Removed entire policy from table {table_arn}") 195 | 196 | return {"statusCode": 200, "body": "Successfully cleaned up policy"} 197 | except Exception as e: 198 | print(f"Error cleaning up policy: {str(e)}") 199 | # Don't fail if policy doesn't exist 200 | if "ResourceNotFoundException" in str(e) or "PolicyNotFoundException" in str(e): 201 | return {"statusCode": 200, "body": "Policy already removed"} 202 | raise e 203 | else: 204 | print("No table ARN provided for cleanup") 205 | return {"statusCode": 200, "body": "No cleanup needed"} 206 | InputPayload: 207 | tableArn: "{{ applyDenyPolicy.tableArn }}" 208 | targetRegion: "{{ applyDenyPolicy.targetRegion }}" 209 | tableName: "{{ tableName }}" 210 | existingPolicy: "{{ applyDenyPolicy.existingPolicy }}" 211 | --------------------------------------------------------------------------------
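These automation documents are typically registered as SSM documents and invoked by AWS FIS through the `aws:ssm:start-automation-execution` action wired into the corresponding experiment templates. For a quick standalone smoke test of the DynamoDB region impairment automation above, a sketch like the following could start it directly with boto3. This is an illustration only, not part of the repository: the registered document name, IAM role ARN, table name, and region are placeholder assumptions you would replace with your own values.

```python
# Minimal sketch (assumptions noted below): start the DynamoDB region
# impairment automation directly via SSM for a short dry run, then poll
# until it reaches a terminal state so the cleanupPolicy step has removed
# the FISDenyAccess statement again.
import time
import boto3

ssm = boto3.client("ssm", region_name="us-east-1")

execution_id = ssm.start_automation_execution(
    # Hypothetical name under which the automation YAML was registered.
    DocumentName="dynamodb-region-impairment-automation",
    Parameters={
        "tableName": ["my-global-table"],      # placeholder table name
        "targetRegion": ["us-east-1"],
        "duration": ["PT2M"],                  # keep the impairment short for a dry run
        # Placeholder ARN; use the role created from the SSM automation policy in this repo.
        "AutomationAssumeRole": ["arn:aws:iam::123456789012:role/my-ssm-automation-role"],
    },
)["AutomationExecutionId"]

terminal_states = {"Success", "Failed", "TimedOut", "Cancelled"}
while True:
    status = ssm.get_automation_execution(AutomationExecutionId=execution_id)[
        "AutomationExecution"
    ]["AutomationExecutionStatus"]
    print(f"{execution_id}: {status}")
    if status in terminal_states:
        break
    time.sleep(15)
```

Waiting for a terminal state matters here because the deny statement is only removed in the final cleanup step; interrupting the run early would leave the table policy in the impaired state until it is cleaned up manually.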