├── App
│   ├── ingress.yaml
│   ├── nginx-bench.yaml
│   ├── nginx.yaml
│   └── service.yaml
├── Argo
│   ├── Jenkinsfile
│   ├── argo-access.yaml
│   ├── argowf-chaos-admin.yaml
│   └── argowf-native-pod-delete.yaml
├── IMG
│   ├── chaos-admin.png
│   ├── chaos-litmus-jenkins.png
│   ├── chaos.png
│   └── reliability_pipeline_chaos_workflows.png
├── README.bkp
└── README.md

/App/ingress.yaml:
--------------------------------------------------------------------------------
kind: Ingress
# NOTE: extensions/v1beta1 Ingress was removed in Kubernetes 1.22; use networking.k8s.io/v1 on newer clusters
apiVersion: extensions/v1beta1
metadata:
  annotations:
    external-dns.alpha.kubernetes.io/hostname: nginx.youdomain.com
  labels:
    app: nginx
  name: ingress
spec:
  rules:
  - http:
      paths:
      - backend:
          serviceName: nginx
          # App/service.yaml exposes the nginx service on port 80 only
          servicePort: 80
        path: /*
--------------------------------------------------------------------------------
/App/nginx-bench.yaml:
--------------------------------------------------------------------------------
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app: nginx-bench
  generateName: nginx-bench-
spec:
  template:
    metadata:
      labels:
        app: nginx-bench
    spec:
      restartPolicy: Never
      containers:
      - args:
        - -c
        - /go/bin/main -r -c10 -t${BENCHMARK_DURATION} -n 10000000 http://${NGINX_SVC_NAME}:${NGINX_PORT_NUM}/; exit 0
        command:
        - /bin/sh
        env:
        - name: NGINX_SVC_NAME
          value: "nginx.default.svc.cluster.local"
        - name: NGINX_PORT_NUM
          value: "80"
        - name: BENCHMARK_DURATION
          value: "300"
        image: devth/alpine-bench
        imagePullPolicy: Always
        name: nginx-bench
--------------------------------------------------------------------------------
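A note on running this Job: because the manifest uses `generateName` rather than a fixed `metadata.name`, it must be submitted with `kubectl create`, which lets the API server generate a unique name; `kubectl apply` requires a concrete name. A minimal sketch, assuming the default namespace:

```
kubectl create -f App/nginx-bench.yaml
kubectl get jobs -l app=nginx-bench
```
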
/App/nginx.yaml:
--------------------------------------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
  annotations:
    litmuschaos.io/chaos: "true"
  labels:
    app: nginx
  name: nginx
spec:
  replicas: 3
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - image: nginx:latest
        imagePullPolicy: Always
        name: nginx
        ports:
        - containerPort: 80
          protocol: TCP
--------------------------------------------------------------------------------
/App/service.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Service
metadata:
  name: nginx
spec:
  ports:
  - port: 80
    protocol: TCP
    targetPort: 80
  selector:
    app: nginx
  type: NodePort
--------------------------------------------------------------------------------
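Since the service is of type NodePort, the cluster assigns the external port at creation time. A quick way to discover it (a sketch; the jsonpath expression assumes the service has a single port):

```
kubectl get svc nginx -o jsonpath='{.spec.ports[0].nodePort}'
```
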
/Argo/Jenkinsfile:
--------------------------------------------------------------------------------
import java.text.SimpleDateFormat

def l1 = 'dev'
def l2 = 'tools'
def serviceName = 'fmea-test-4'
def serviceRepoName = 'fmea-test-4'
def region = 'usw2'
def iksType = 'ppd'
def appName = "${l1}-${l2}-${serviceName}-${region}-${iksType}"
//def appName = 'fmea-test-4'
def deployable_branches = ["fmea1"]
def ptNameVersion = "${serviceName}-${UUID.randomUUID().toString().toLowerCase()}"
def repo = "dev/tools/fmea-test-4/service"
def tag = ""
def registry = "docker.com"
def image = "${repo}/${serviceName}"
def preprodOnly = true
def git_repo = ""
def buildURL = ""
// ****** Load Test default values *******
def noOfPods = "2"
def noOfUsers = "20"
def durationInMin = "2"
def healthUrlDefault = "https://test.com/health/full"
def pfi_namespace = "test-namespace"
def waitForUserInputMins = 2
def perfJsonDefault = "defaultParams.json"
def karateEnvDefault = "prf"
def peakTPSDefault = "1"
def rampupTimeDefault = "2"
def steadyStateTimeDefault = "7"
def baseurl = "https://test.com"
def Custom = "FMEA1"
// Environment name used when reporting stage status; assumed value, adjust to your setup
def envName = "pfi"
// **********************************************

def clusterMap = [:]
clusterMap["pfi"] = "https://api.us-west-2.elb.amazonaws.com"

properties([
    /*
    daysToKeepStr: history is only kept up to this many days.
    numToKeepStr: only this number of build logs are kept.
    artifactDaysToKeepStr: artifacts are only kept up to this many days.
    artifactNumToKeepStr: only this number of builds have their artifacts kept.
    */
    buildDiscarder(logRotator(daysToKeepStr: '', numToKeepStr: '10', artifactDaysToKeepStr: '', artifactNumToKeepStr: ''))
])

podTemplate(name: ptNameVersion, label: ptNameVersion, containers: [
    //containerTemplate(name: 'docker', image: 'docker.com/dev/build/ibp/jnlp-slave-with-docker:18.03.0', ttyEnabled: true, command: 'cat', args: ''),
    containerTemplate(name: 'cdtools', image: 'docker.com/dev/deploy/cd/argocd-utils:stable', alwaysPullImage: true, ttyEnabled: true, command: 'cat', args: ''),
    containerTemplate(name: 'argo', image: 'docker.com/argoproj/argocli:v2.2.1', alwaysPullImage: true, ttyEnabled: true, command: 'cat', args: ''),
    containerTemplate(name: 'aws-s3', image: 'docker.com/dev/build/ibp/jnlp-slave-with-docker:3.26-1_jenkins-2-138-update_3', alwaysPullImage: true, ttyEnabled: true, command: 'cat', args: '')
],
volumes: [hostPathVolume(hostPath: '/var/run/dind/docker.sock', mountPath: '/var/run/docker.sock')])
{
    try {
        // DO NOT CHANGE
        def isPR = env.CHANGE_ID != null
        def branch = env.CHANGE_ID != null ? env.CHANGE_TARGET : env.BRANCH_NAME
        def dateFormat = new SimpleDateFormat("yyyyMMddHHmm")
        def date = new Date()
        def date_tag = dateFormat.format(date)

        node(ptNameVersion) {
            // DO NOT CHANGE
            def scmInfo = checkout scm
            println("printing scmInfo:")
            println("********URL***********")
            println(scmInfo.GIT_URL)
            buildURL = env.BUILD_URL
            git_repo = scmInfo.GIT_URL
            println("********URL***********")
            def shortCommit = "${scmInfo.GIT_COMMIT}"[0..6]
            tag = "${env.BUILD_TAG}-${shortCommit}"
            def hasReleaseTag = sh(returnStdout: true, script: 'git tag --points-at HEAD').trim().startsWith('release-')

            stage("Chaos Testing") {
                boolean failed = false
                try {
                    withCredentials([file(credentialsId: 'ARGO_CHAOS', variable: 'ARGO_CHAOS')]) {
                        container('argo') {
                            println("invoking argo ${appName}")
                            sh """#!/bin/sh -xe
                            ls -al
                            mkdir \${HOME}/.kube
                            cp \${ARGO_CHAOS} \${HOME}/.kube/config
                            """
                            sh("argo list --kubeconfig \${HOME}/.kube/config")
                            sh("argo submit Argo/argowf-chaos-admin.yaml -p appNamespace=test-namespace -p appLabel=fmea-test-4 --watch")
                            sh("argo list")
                        }
                    }
                } catch (err) {
                    failed = true
                } finally {
                    // processStatus is assumed to be provided by a Jenkins shared library
                    processStatus(failed, "Test_${envName}", envName)
                }
            }

        } // node
        if (preprodOnly || isPR) {
            echo "Preprod or PR build, not going to try Stage or Prod"
            currentBuild.result = 'SUCCESS'
            return
        }
    } catch (e) {
        echo "Caught error during pipeline: ${e}"
        throw e
    } finally {
        echo "Current build result = ${currentBuild.result}"
    }
}
--------------------------------------------------------------------------------
/Argo/argo-access.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: ServiceAccount
metadata:
  name: argo-chaos
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: chaos-cluster-role
rules:
- apiGroups:
  - '*'
  resources:
  - '*'
  verbs:
  - '*'
---
# rbac.authorization.k8s.io/v1 is used for both objects; v1beta1 was removed in Kubernetes 1.22
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: chaos-cluster-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: chaos-cluster-role
subjects:
- kind: ServiceAccount
  name: argo-chaos
  namespace: litmus
--------------------------------------------------------------------------------
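Note that this ClusterRole grants `*` verbs on `*` resources across `*` API groups, which keeps the demo simple but is very broad for shared clusters; consider scoping it down for real environments. To verify the account and binding were created (assuming they were applied to the litmus namespace, as the READMEs suggest):

```
kubectl get sa argo-chaos -n litmus
kubectl get clusterrolebinding chaos-cluster-role-binding
```
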
/Argo/argowf-chaos-admin.yaml:
--------------------------------------------------------------------------------
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: argowf-chaos-
spec:
  entrypoint: argowf-chaos
  serviceAccountName: argo-chaos
  arguments:
    parameters:
      - name: appNamespace
        value: "default"
      - name: adminModeNamespace
        value: "litmus"
      - name: appLabel
        value: "nginx"
      - name: fileName
        value: "pod-app-kill-count.json"
  templates:
    - name: argowf-chaos
      steps:
        - - name: run-chaos
            template: run-chaos
          - name: run-benchmark
            template: run-benchmark
        - - name: revert-chaos
            template: revert-chaos

    - name: run-chaos
      inputs:
        artifacts:
          - name: run-chaos
            path: /tmp/chaosengine.yaml
            raw:
              data: |
                apiVersion: litmuschaos.io/v1alpha1
                kind: ChaosEngine
                metadata:
                  name: nginx-chaos
                  namespace: {{workflow.parameters.adminModeNamespace}}
                spec:
                  appinfo:
                    appns: {{workflow.parameters.appNamespace}}
                    applabel: "app={{workflow.parameters.appLabel}}"
                    appkind: deployment
                  jobCleanUpPolicy: retain
                  monitoring: false
                  annotationCheck: 'false'
                  engineState: 'active'
                  chaosServiceAccount: litmus-admin
                  experiments:
                    - name: k8-pod-delete
                      spec:
                        components:
                          env:
                            - name: NAME_SPACE
                              value: {{workflow.parameters.appNamespace}}
                            - name: LABEL_NAME
                              value: {{workflow.parameters.appLabel}}
                            - name: FILE
                              value: {{workflow.parameters.fileName}}
                            - name: REPORT
                              value: 'true'
                            - name: REPORT_ENDPOINT
                              value: 'none'
      container:
        image: lachlanevenson/k8s-kubectl
        command: [sh, -c]
        # '&&' chains the apply, the log message, and the sleep sequentially
        args: ['kubectl apply -f /tmp/chaosengine.yaml -n {{workflow.parameters.adminModeNamespace}} && echo "sleeping for 120s" && sleep 120']

    - name: run-benchmark
      inputs:
        artifacts:
          - name: run-benchmark
            path: /tmp/bench.yaml
            raw:
              data: |
                apiVersion: batch/v1
                kind: Job
                metadata:
                  labels:
                    app: nginx-bench
                  name: nginx-bench
                spec:
                  template:
                    metadata:
                      labels:
                        app: nginx-bench
                    spec:
                      restartPolicy: Never
                      containers:
                        - args:
                            - -c
                            - /go/bin/main -r -c10 -t${BENCHMARK_DURATION} -n 10000000 http://${NGINX_SVC_NAME}:${NGINX_PORT_NUM}/; exit 0
                          command:
                            - /bin/sh
                          env:
                            - name: NGINX_SVC_NAME
                              value: "nginx.default.svc.cluster.local"
                            - name: NGINX_PORT_NUM
                              value: "80"
                            - name: BENCHMARK_DURATION
                              value: "300"
                          image: devth/alpine-bench
                          imagePullPolicy: Always
                          name: nginx-bench
      container:
        image: lachlanevenson/k8s-kubectl
        command: [sh, -c]
        args: ['kubectl apply -f /tmp/bench.yaml -n {{workflow.parameters.appNamespace}}']

    - name: revert-chaos
      container:
        image: lachlanevenson/k8s-kubectl
        command: [sh, -c]
        args: ['sleep 20 && kubectl delete chaosengine nginx-chaos -n {{workflow.parameters.adminModeNamespace}}']
--------------------------------------------------------------------------------
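All four workflow parameters declared above can be overridden at submission time with `-p`, which is how the bundled Jenkinsfile invokes this workflow. An illustrative invocation, assuming the chaos resources live in the litmus namespace:

```
argo submit Argo/argowf-chaos-admin.yaml -n litmus \
  -p appNamespace=default \
  -p appLabel=nginx \
  -p fileName=pod-app-kill-count.json \
  --watch
```
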
/Argo/argowf-native-pod-delete.yaml:
--------------------------------------------------------------------------------
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: argowf-chaos-
spec:
  entrypoint: argowf-chaos
  serviceAccountName: argo-chaos
  arguments:
    parameters:
      - name: appNamespace
        value: "default"
      - name: adminModeNamespace
        value: "litmus"
      - name: appLabel
        value: "nginx"
  templates:
    - name: argowf-chaos
      steps:
        - - name: run-benchmark
            template: run-benchmark
          - name: run-chaos
            template: run-chaos
        - - name: revert-chaos
            template: revert-chaos

    - name: run-chaos
      inputs:
        artifacts:
          - name: run-chaos
            path: /tmp/chaosengine.yaml
            raw:
              data: |
                apiVersion: litmuschaos.io/v1alpha1
                kind: ChaosEngine
                metadata:
                  name: nginx-chaos
                  namespace: {{workflow.parameters.adminModeNamespace}}
                spec:
                  appinfo:
                    appns: {{workflow.parameters.appNamespace}}
                    applabel: "app={{workflow.parameters.appLabel}}"
                    appkind: deployment
                  jobCleanUpPolicy: retain
                  monitoring: false
                  annotationCheck: 'false'
                  engineState: 'active'
                  chaosServiceAccount: litmus-admin
                  experiments:
                    - name: pod-delete
                      spec:
                        components:
                          env:
                            - name: TOTAL_CHAOS_DURATION
                              value: "10"
                            - name: CHAOS_INTERVAL
                              value: "10"
                            - name: FORCE
                              value: "false"
      container:
        image: lachlanevenson/k8s-kubectl
        command: [sh, -c]
        # '&&' chains the apply, the log message, and the sleep sequentially
        args: ['kubectl apply -f /tmp/chaosengine.yaml -n {{workflow.parameters.adminModeNamespace}} && echo "sleeping for 120s" && sleep 120']

    - name: run-benchmark
      inputs:
        artifacts:
          - name: run-benchmark
            path: /tmp/bench.yaml
            raw:
              data: |
                apiVersion: batch/v1
                kind: Job
                metadata:
                  labels:
                    app: nginx-bench
                  generateName: nginx-bench-
                spec:
                  template:
                    metadata:
                      labels:
                        app: nginx-bench
                    spec:
                      restartPolicy: Never
                      containers:
                        - args:
                            - -c
                            - /go/bin/main -r -c10 -t${BENCHMARK_DURATION} -n 10000000 http://${NGINX_SVC_NAME}:${NGINX_PORT_NUM}/; exit 0
                          command:
                            - /bin/sh
                          env:
                            - name: NGINX_SVC_NAME
                              value: "nginx.default.svc.cluster.local"
                            - name: NGINX_PORT_NUM
                              value: "80"
                            - name: BENCHMARK_DURATION
                              value: "300"
                          image: devth/alpine-bench
                          imagePullPolicy: Always
                          name: nginx-bench
      container:
        image: lachlanevenson/k8s-kubectl
        command: [sh, -c]
        # kubectl create (not apply) is required because the Job uses generateName
        args: ['kubectl create -f /tmp/bench.yaml -n {{workflow.parameters.appNamespace}}']

    - name: revert-chaos
      container:
        image: lachlanevenson/k8s-kubectl
        command: [sh, -c]
        args: ['sleep 20 && kubectl delete chaosengine nginx-chaos -n {{workflow.parameters.adminModeNamespace}}']
--------------------------------------------------------------------------------
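This workflow follows the same structure as `argowf-chaos-admin.yaml` but uses the Litmus-native `pod-delete` experiment (tuned via `TOTAL_CHAOS_DURATION`, `CHAOS_INTERVAL`, and `FORCE`) instead of the chaostoolkit-based `k8-pod-delete`, so the corresponding `pod-delete` ChaosExperiment CR must be installed in the admin namespace beforehand. Submission is the same pattern:

```
argo submit Argo/argowf-native-pod-delete.yaml -n litmus -p appLabel=nginx --watch
```
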
/IMG/chaos-admin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litmuschaos/chaos-workflows/c47bc033f1e8dda398e87a0dffc0546339c3933e/IMG/chaos-admin.png
--------------------------------------------------------------------------------
/IMG/chaos-litmus-jenkins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litmuschaos/chaos-workflows/c47bc033f1e8dda398e87a0dffc0546339c3933e/IMG/chaos-litmus-jenkins.png
--------------------------------------------------------------------------------
/IMG/chaos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litmuschaos/chaos-workflows/c47bc033f1e8dda398e87a0dffc0546339c3933e/IMG/chaos.png
--------------------------------------------------------------------------------
/IMG/reliability_pipeline_chaos_workflows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litmuschaos/chaos-workflows/c47bc033f1e8dda398e87a0dffc0546339c3933e/IMG/reliability_pipeline_chaos_workflows.png
--------------------------------------------------------------------------------
/README.bkp:
--------------------------------------------------------------------------------
# chaos-workflows
Argo integration with LitmusChaos to create Chaos Workflows

# Prerequisites
* We assume you are using a Kubernetes cluster and have access to a namespace with the right permissions

## Argo Workflow
* Please refer to `https://github.com/argoproj/argo/tree/master/manifests` for Argo installation
### Validation
* Validate the Argo installation via the steps below
  * Export KUBECONFIG: `export KUBECONFIG=/Users/<user>/.kube/config`
  * Verify CRDs: `kubectl get crds | grep argo`
  * Validate the api-resources created: `kubectl api-resources | grep argo`


## Litmus
* Please refer to `https://github.com/litmuschaos/chaos-operator/blob/master/README.md` for Litmus installation

### Validation
* Validate the Litmus installation via the steps below
  * Export KUBECONFIG: `export KUBECONFIG=/Users/<user>/.kube/config`
  * Validate the operator: `kubectl get pods -n litmus`
  * Verify CRDs: `kubectl get crds | grep chaos`
  * Validate the api-resources created: `kubectl api-resources | grep litmus`


# Application
* Go to the App folder and install the nginx deployment, service, and ingress according to your Kubernetes setup
  * `kubectl apply -f nginx.yaml`
  * `kubectl apply -f service.yaml` [Optional]
  * `kubectl apply -f ingress.yaml` [Optional]

# Chaos

## ChaosToolkit Experiment
* Please refer to `https://hub.litmuschaos.io/charts/chaostoolkit` to install the chaostoolkit experiment
* Detailed steps are here: `https://github.com/litmuschaos/chaos-charts/blob/master/charts/chaostoolkit/Readme.md`
* You have to check out chaos-charts: `https://github.com/litmuschaos/chaos-charts/tree/master/charts/chaostoolkit`
* Once you have the code, execute the steps below

## In Namespace Changes - Service use case
* This use case assumes you want to execute the chaos experiment in the same namespace
* Apply the experiments for Kubernetes: `kubectl apply -f experiments.yaml`
* Validate the experiments for Kubernetes: `kubectl get chaosexperiment`
* Set up RBAC for pod delete: `kubectl apply -f rbac.yaml`
* Create the pod experiment (for the health experiment): `kubectl create -f engine.yaml`
* Validate the experiment: `kubectl get pods -w`
* Validate logs: `kubectl logs -f <chaos-pod-name>`
* Clean up the chaosexperiment: `kubectl delete -f engine.yaml`
* Clean up RBAC: `kubectl delete -f rbac.yaml`


## Remote namespace - Admin use case
* This use case assumes you want to execute the chaos pod in a different namespace
* Apply the experiments for Kubernetes: `kubectl apply -f experiments.yaml`
* Validate the experiments for Kubernetes: `kubectl get chaosexperiments`
* Set up RBAC in admin mode: `kubectl apply -f rbac-admin.yaml`
* Create the pod experiment (for the health experiment): `kubectl create -f engine-kiam.yaml`
* Validate the experiment: `kubectl get pods -w`
* Validate logs: `kubectl logs -f <chaos-pod-name>`
* Clean up the chaosexperiment: `kubectl delete -f engine-kiam.yaml`
* Clean up RBAC: `kubectl delete -f rbac-admin.yaml`


# Argo

* Please install the argo command line (`https://github.com/argoproj/homebrew-tap`) and complete the Argo setup mentioned in the prerequisites
* Set up the Argo service account, cluster role, and cluster role binding
  * Go to the Argo folder and execute `kubectl apply -f argo-access.yaml`
  * Validate that the service account exists: `kubectl get sa`

## Argo Execution
* Validate your Argo setup: `argo list`

### In Namespace Changes - Service use case
* `argo submit` brings the workflow to a running state; it adds the required role and deletes it after the execution
* You can pass command line arguments to override values (see the example below)
* Execute `argo submit argo-chaos.yaml`
* Validate via `argo watch <workflow-name>`
![Argo with service](IMG/chaos.png)
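For example, a hypothetical override of the namespace and app label at submission time (parameter names follow the workflow manifests in the Argo folder):

```
argo submit argo-chaos.yaml -p appNamespace=default -p appLabel=nginx
```
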

### Remote namespace - Admin use case
* `argo submit` brings the workflow to a running state; it adds the required role and deletes it after the execution
* You can pass command line arguments to override values, as in the example above
* Execute `argo submit argo-chaos-admin.yaml`
* Validate via `argo watch <workflow-name>`
![Argo with admin](IMG/chaos-admin.png)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Chaos-Workflows

Chaos Workflows are a set of actions strung together to achieve desired chaos impact on a Kubernetes cluster. Workflows
are an effective mechanism to simulate real-world conditions & gauge application behaviour.
Take these use cases for example:


- Most often, failures do not occur as isolated, single instances. There may be underlying chronic application-specific or
deploy-environment-induced conditions on the cluster when other failures occur, effectively causing a multi-component/complex
failure. For example: pod failures occurring while other nodes are in sub-optimal or unschedulable states.

- Running chaos under highly loaded conditions. The parallel actions of a benchmark run against an app deployment and staggered
chaos during this run are highly instructive of the performance characteristics & deployment sanity of the application.

Workflows are also useful for automating a series of pre-conditioning/setup actions that must be performed before triggering
chaos.

LitmusChaos leverages the popular workflow & GitOps tool [Argo](https://argoproj.github.io/) to achieve this. Argo facilitates
the creation of a wide variety of chaos workflow models while being extremely simple & efficient to use.

This repository hosts predefined workflows based on LitmusChaos experiments that you can pick up for use, as well as the dev/usage docs
that explain how to construct your own chaos workflows.

You can refer to the following presentation by [Michael Knyazev](https://www.linkedin.com/public-profile/in/mikhailkniazev) at Chaos Carnival 2021 about
how Chaos Workflows can be used to construct Reliability Pipelines on Amazon EKS.

[![Building Reliability Pipelines with Chaos Workflows](IMG/reliability_pipeline_chaos_workflows.png)](https://www.youtube.com/watch?v=7yBFgqUo01E "Building Reliability Pipelines with Chaos Workflows")

## Getting Started

The subsequent sections explain how to get started with a simple chaos workflow that disrupts (via pod-delete chaos) a multi-replica
nginx deployment while a load generator drives benchmark traffic against it. The typical use case for such a chaos workflow
is to observe the extent of degradation in completed requests & request rate (resiliency & performance indicators).

Users can play around with the benchmark and chaos parameters as part of a detailed experiment to understand application behaviour,
fix bugs/deployment issues, and arrive at achievable SLAs.

### Install Argo Workflow Infrastructure

The Argo workflow infra consists of the Argo workflow CRDs, Workflow Controller, associated RBAC & Argo CLI. The steps
shown below install Argo in the standard cluster-wide mode, wherein the workflow controller operates on all
namespaces. Ensure that you have the permissions required to create these resources.

If you would like to run Argo with a namespace scope, refer to [this](https://github.com/argoproj/argo/blob/master/manifests/namespace-install.yaml) manifest.

- Create the argo namespace

```
root@demo:~/chaos-workflows# kubectl create ns argo
namespace/argo created
```

- Create the CRDs and the workflow controller deployment with associated RBAC

```
root@demo:~/chaos-workflows# kubectl apply -f https://raw.githubusercontent.com/argoproj/argo/stable/manifests/install.yaml

customresourcedefinition.apiextensions.k8s.io/clusterworkflowtemplates.argoproj.io created
customresourcedefinition.apiextensions.k8s.io/cronworkflows.argoproj.io created
customresourcedefinition.apiextensions.k8s.io/workflows.argoproj.io created
customresourcedefinition.apiextensions.k8s.io/workflowtemplates.argoproj.io created
serviceaccount/argo created
serviceaccount/argo-server created
role.rbac.authorization.k8s.io/argo-role created
clusterrole.rbac.authorization.k8s.io/argo-aggregate-to-admin configured
clusterrole.rbac.authorization.k8s.io/argo-aggregate-to-edit configured
clusterrole.rbac.authorization.k8s.io/argo-aggregate-to-view configured
clusterrole.rbac.authorization.k8s.io/argo-cluster-role configured
clusterrole.rbac.authorization.k8s.io/argo-server-cluster-role configured
rolebinding.rbac.authorization.k8s.io/argo-binding created
clusterrolebinding.rbac.authorization.k8s.io/argo-binding unchanged
clusterrolebinding.rbac.authorization.k8s.io/argo-server-binding unchanged
configmap/workflow-controller-configmap created
service/argo-server created
service/workflow-controller-metrics created
deployment.apps/argo-server created
deployment.apps/workflow-controller created
```

- Verify successful creation of the argo resources

```
root@demo:~/chaos-workflows# kubectl get crds | grep argo

clusterworkflowtemplates.argoproj.io    2020-05-15T03:01:31Z
cronworkflows.argoproj.io               2020-05-15T03:01:31Z
workflows.argoproj.io                   2020-05-15T03:01:31Z
workflowtemplates.argoproj.io           2020-05-15T03:01:31Z
```

```
root@demo:~/chaos-workflows# kubectl api-resources | grep argo

clusterworkflowtemplates   clusterwftmpl,cwft   argoproj.io   false   ClusterWorkflowTemplate
cronworkflows              cronwf,cwf           argoproj.io   true    CronWorkflow
workflows                  wf                   argoproj.io   true    Workflow
workflowtemplates          wftmpl               argoproj.io   true    WorkflowTemplate
```

```
root@demo:~/chaos-workflows# kubectl get pods -n argo
NAME                                   READY   STATUS    RESTARTS   AGE
argo-server-65cbb4874c-cbq2h           0/1     Running   0          12s
workflow-controller-55bffbdbfd-c4jdf   1/1     Running   0          12s
```

- Install the argo CLI on the harness/test machine (where the kubeconfig is available)

```
root@demo:~# curl -sLO https://github.com/argoproj/argo/releases/download/v2.8.0/argo-linux-amd64

root@demo:~# chmod +x argo-linux-amd64

root@demo:~# mv ./argo-linux-amd64 /usr/local/bin/argo

root@demo:~# argo version
argo: v2.8.0
  BuildDate: 2020-05-11T22:55:16Z
  GitCommit: 8f696174746ed01b9bf1941ad03da62d312df641
  GitTreeState: clean
  GitTag: v2.8.0
  GoVersion: go1.13.4
  Compiler: gc
  Platform: linux/amd64

```

### Install Litmus Infrastructure

Refer to the LitmusChaos [documentation](https://docs.litmuschaos.io) to get started with installing the Litmus infra on your
Kubernetes clusters. In this example, we will use the [admin mode](https://docs.litmuschaos.io/docs/admin-mode/) of execution, where
all chaos resources are created in the centralized namespace, litmus.
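Once the operator is installed, a quick sanity check (mirroring the validation steps in README.bkp; the namespace is assumed to be litmus):

```
kubectl get pods -n litmus
kubectl get crds | grep chaos
kubectl api-resources | grep litmus
```
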

### Install a Sample Application: Nginx

- Install a simple multi-replica stateless nginx deployment with its service exposed over NodePort

```
root@demo:~# kubectl apply -f https://raw.githubusercontent.com/litmuschaos/chaos-workflows/master/App/nginx.yaml

deployment.apps/nginx created
```

```
root@demo:~# kubectl apply -f https://raw.githubusercontent.com/litmuschaos/chaos-workflows/master/App/service.yaml

service/nginx created
```

You can access this service at `http://<node-ip>:<node-port>`

### Create the Argo Access ServiceAccount

- Create the service account and associated RBAC, which will be used by the Argo workflow controller to execute the
actions specified in the workflow. In our case, this corresponds to launching the nginx benchmark job and creating
the chaosengine to trigger the pod-delete chaos action. In our example, we place it in the namespace where the litmus
chaos resources reside.

```
root@demo:~# kubectl apply -f https://raw.githubusercontent.com/litmuschaos/chaos-workflows/master/Argo/argo-access.yaml -n litmus

serviceaccount/argo-chaos created
clusterrole.rbac.authorization.k8s.io/chaos-cluster-role created
clusterrolebinding.rbac.authorization.k8s.io/chaos-cluster-role-binding created
```

### Create the Pod-Delete ChaosExperiment CR

- Create the pod-delete chaosexperiment custom resource in the litmus namespace. This example makes use of the [chaostoolkit chart](https://github.com/litmuschaos/chaos-charts/tree/master/charts/chaostoolkit) as the means to execute the chaos.

```
root@demo:~# kubectl apply -f https://hub.litmuschaos.io/api/chaos/master?file=charts/chaostoolkit/k8-pod-delete/experiment.yaml -n litmus
chaosexperiment.litmuschaos.io/k8-pod-delete created
```

```
root@demo:~# kubectl get chaosexperiments -n litmus
NAME            AGE
k8-pod-delete   13s
```

### Create the Chaos Workflow

- Applying the workflow manifest performs the following actions in parallel:

  - Starts an nginx benchmark job for the specified duration (300s by default in the bundled manifest)
  - Triggers a random pod-kill of the nginx replicas by creating the chaosengine CR, and cleans up after the chaos


```
root@demo:~# argo submit https://raw.githubusercontent.com/litmuschaos/chaos-workflows/master/Argo/argowf-chaos-admin.yaml -n litmus
Name:                argowf-chaos-sl2cn
Namespace:           litmus
ServiceAccount:      argo-chaos
Status:              Pending
Created:             Fri May 15 15:31:45 +0000 (now)
Parameters:
  appNamespace:       default
  adminModeNamespace: litmus
  appLabel:           nginx
  fileName:           pod-app-kill-count.json
```
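You can also follow the workflow from the CLI; `argo watch` tails the state transitions of the submitted workflow (substitute the generated name from your own submission):

```
argo list -n litmus
argo watch argowf-chaos-sl2cn -n litmus
```
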

### Visualize the Chaos Workflow

- You can visualize the progress of the chaos workflow via the Argo UI. Convert the argo-server service to type NodePort &
view the dashboard at `https://<node-ip>:<node-port>`

```
root@demo:~# kubectl patch svc argo-server -n argo -p '{"spec": {"type": "NodePort"}}'
service/argo-server patched
```

![image](https://user-images.githubusercontent.com/21166217/82098260-38738b00-9722-11ea-81b4-b3c466a60080.png)
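If you would rather not change the service type, port-forwarding is an alternative; 2746 is assumed to be the argo-server's default port here, which is worth verifying against your Argo version:

```
kubectl port-forward svc/argo-server -n argo 2746:2746
```
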

### Running Chaos experiments from a Jenkins pipeline

- Set up a Jenkins job and trigger the Argo workflow from Jenkins. This gives you the benefits of running the tests as a pipeline: you can schedule it, store the results, run it as a downstream job, etc.

- Steps
  1. Create the KUBECONFIG using the token from the `argo-chaos` service account
  2. Connect to the namespace where you created the `argo-chaos` service account and execute the commands below to get the token
  ```
  SERVICE_ACCOUNT=argo-chaos
  APISERVER=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}')
  SECRET_NAME=$(kubectl get serviceaccount $SERVICE_ACCOUNT -o jsonpath='{.secrets[0].name}')
  TOKEN=$(kubectl get secret $SECRET_NAME -o jsonpath='{.data.token}' | base64 --decode)
  ```
  3. Create the KUBECONFIG using the above token (alternatively, see the kubectl sketch at the end of this README)
  ```
  apiVersion: v1
  clusters:
  - cluster:
      certificate-authority-data: asjdsajkdsajdhkjsadhak==
      server: https://api-abcdef-ppd-usw2-12345678.us-west-2.elb.amazonaws.com
    name: abcdef-ppd-usw2.cluster.k8s.local
  contexts:
  - context:
      cluster: 'abcdef-ppd-usw2.cluster.k8s.local'
      namespace: 'abcdef-perf-infra-usw2-ppd-pfi'
      user: 'argo-chaos'
    name: service-account
  current-context: service-account
  kind: Config
  preferences: {}
  users:
  - name: 'argo-chaos'
    user:
      token: ''
  ```
  4. Upload the KUBECONFIG file in the `credentials` section in Jenkins as a `secret file` (see the screenshot below)
  5. Set up the Jenkins job, referring to the `Jenkinsfile`
  6. Once all setup is complete, you can trigger the Jenkins job
![Jenkins Pipeline argo](IMG/chaos-litmus-jenkins.png)
--------------------------------------------------------------------------------
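As an alternative to hand-writing the KUBECONFIG in step 3 above, the same file can be assembled with `kubectl config`; a minimal sketch using the variables from step 2, where the cluster name and CA file path are illustrative placeholders:

```
# Write a standalone kubeconfig for the argo-chaos service account
kubectl config --kubeconfig=argo-chaos.kubeconfig set-cluster my-cluster \
  --server="$APISERVER" --certificate-authority=ca.crt --embed-certs=true
kubectl config --kubeconfig=argo-chaos.kubeconfig set-credentials argo-chaos --token="$TOKEN"
kubectl config --kubeconfig=argo-chaos.kubeconfig set-context service-account \
  --cluster=my-cluster --user=argo-chaos
kubectl config --kubeconfig=argo-chaos.kubeconfig use-context service-account
```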