├── .github ├── PULL_REQUEST_TEMPLATE.md ├── release-template.md ├── release.yml └── workflows │ ├── docker-image.yml │ ├── release.yml │ ├── require-docs.yml │ └── tests.yml ├── .gitignore ├── .gitleaks.toml ├── ADOPTERS.md ├── CI ├── README.md ├── config │ └── common_test_config.yaml ├── legacy │ ├── scenarios │ │ ├── cluster_shut_down_scenario.yml │ │ ├── node_scenario.yml │ │ ├── volume_scenario.yaml │ │ ├── zone_outage.yaml │ │ └── zone_outage_env.yaml │ └── tests │ │ ├── test_nodes.sh │ │ ├── test_shut_down.sh │ │ └── test_zone.sh ├── run.sh ├── run_test.sh ├── templates │ ├── container_scenario_pod.yaml │ ├── outage_pod.yaml │ ├── service_hijacking.yaml │ └── time_pod.yaml └── tests │ ├── common.sh │ ├── functional_tests │ ├── test_app_outages.sh │ ├── test_container.sh │ ├── test_cpu_hog.sh │ ├── test_io_hog.sh │ ├── test_memory_hog.sh │ ├── test_namespace.sh │ ├── test_net_chaos.sh │ ├── test_service_hijacking.sh │ ├── test_telemetry.sh │ └── test_time.sh ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MAINTAINERS.md ├── README.md ├── ROADMAP.md ├── SECURITY.md ├── ansible ├── ansible.cfg ├── inventory ├── kraken.yml ├── templates │ └── kraken.j2 └── vars │ └── kraken_vars.yml ├── config ├── alerts.yaml ├── alerts_openshift.yaml ├── cerberus.yaml ├── config.yaml ├── config_kind.yaml ├── config_kubernetes.yaml ├── config_performance.yaml ├── metrics-aggregated.yaml ├── metrics-report.yaml ├── metrics.yaml └── recommender_config.yaml ├── containers ├── Dockerfile.template ├── README.md ├── build_own_image-README.md ├── compile_dockerfile.sh └── krknctl-input.json ├── kind-config.yml ├── krkn ├── __init__.py ├── cerberus │ ├── __init__.py │ └── setup.py ├── chaos_recommender │ ├── __init__.py │ ├── analysis.py │ ├── kraken_tests.py │ └── prometheus.py ├── invoke │ ├── __init__.py │ └── command.py ├── performance_dashboards │ ├── __init__.py │ └── setup.py ├── prometheus │ ├── __init__.py │ └── client.py ├── scenario_plugins │ ├── __init__.py │ ├── abstract_scenario_plugin.py │ ├── application_outage │ │ ├── __init__.py │ │ └── application_outage_scenario_plugin.py │ ├── container │ │ ├── __init__.py │ │ └── container_scenario_plugin.py │ ├── hogs │ │ ├── __init__.py │ │ └── hogs_scenario_plugin.py │ ├── managed_cluster │ │ ├── __init__.py │ │ ├── common_functions.py │ │ ├── managed_cluster_scenario_plugin.py │ │ └── scenarios.py │ ├── native │ │ ├── __init__.py │ │ ├── native_scenario_plugin.py │ │ ├── network │ │ │ ├── cerberus.py │ │ │ ├── ingress_shaping.py │ │ │ ├── job.j2 │ │ │ ├── kubernetes_functions.py │ │ │ ├── pod_interface.j2 │ │ │ └── pod_module.j2 │ │ ├── plugins.py │ │ ├── pod_network_outage │ │ │ ├── cerberus.py │ │ │ ├── job.j2 │ │ │ ├── kubernetes_functions.py │ │ │ ├── pod_module.j2 │ │ │ └── pod_network_outage_plugin.py │ │ └── run_python_plugin.py │ ├── network_chaos │ │ ├── __init__.py │ │ ├── job.j2 │ │ ├── network_chaos_scenario_plugin.py │ │ └── pod.j2 │ ├── network_chaos_ng │ │ ├── __init__.py │ │ ├── models.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── abstract_network_chaos_module.py │ │ │ ├── node_network_filter.py │ │ │ └── templates │ │ │ │ └── network-chaos.j2 │ │ ├── network_chaos_factory.py │ │ └── network_chaos_ng_scenario_plugin.py │ ├── node_actions │ │ ├── __init__.py │ │ ├── abstract_node_scenarios.py │ │ ├── alibaba_node_scenarios.py │ │ ├── aws_node_scenarios.py │ │ ├── az_node_scenarios.py │ │ ├── bm_node_scenarios.py │ │ ├── common_node_functions.py │ │ ├── docker_node_scenarios.py │ │ ├── gcp_node_scenarios.py │ │ ├── 
general_cloud_node_scenarios.py │ │ ├── ibmcloud_node_scenarios.py │ │ ├── node_actions_scenario_plugin.py │ │ ├── openstack_node_scenarios.py │ │ └── vmware_node_scenarios.py │ ├── pvc │ │ ├── __init__.py │ │ └── pvc_scenario_plugin.py │ ├── scenario_plugin_factory.py │ ├── service_disruption │ │ ├── __init__.py │ │ └── service_disruption_scenario_plugin.py │ ├── service_hijacking │ │ ├── __init__.py │ │ └── service_hijacking_scenario_plugin.py │ ├── shut_down │ │ ├── __init__.py │ │ └── shut_down_scenario_plugin.py │ ├── syn_flood │ │ ├── __init__.py │ │ └── syn_flood_scenario_plugin.py │ ├── time_actions │ │ ├── __init__.py │ │ └── time_actions_scenario_plugin.py │ └── zone_outage │ │ ├── __init__.py │ │ └── zone_outage_scenario_plugin.py ├── tests │ ├── __init__.py │ ├── test_classes │ │ ├── __init__.py │ │ ├── correct_scenario_plugin.py │ │ ├── duplicated_scenario_plugin.py │ │ ├── duplicated_two_scenario_plugin.py │ │ ├── example_scenario_plugin.py │ │ ├── snake_case_mismatch_scenario_plugin.py │ │ ├── wrong_classname_scenario_plugin.py │ │ └── wrong_module.py │ └── test_plugin_factory.py └── utils │ ├── HealthChecker.py │ ├── TeeLogHandler.py │ ├── __init__.py │ └── functions.py ├── media ├── KrakenStarting.png ├── kraken-workflow.png └── logo.png ├── rbac ├── non-privileged-role.yaml ├── non-privileged-rolebinding.yaml ├── privileged-clusterrole.yaml └── privileged-clusterrolebinding.yaml ├── requirements.txt ├── run_kraken.py ├── scenarios ├── kind │ ├── node_scenarios_example.yml │ └── scheduler.yml ├── kube │ ├── container_dns.yml │ ├── cpu-hog.yml │ ├── io-hog.yml │ ├── managedcluster_scenarios_example.yml │ ├── memory-hog.yml │ ├── network-filter.yml │ ├── pod.yml │ ├── scheduler.yml │ ├── service_hijacking.yaml │ └── syn_flood.yaml ├── openshift │ ├── app_outage.yaml │ ├── aws_node_scenarios.yml │ ├── azure_node_scenarios.yml │ ├── baremetal_node_scenarios.yml │ ├── cluster_shut_down_scenario.yml │ ├── container_etcd.yml │ ├── customapp_pod.yaml │ ├── etcd.yml │ ├── gcp_node_scenarios.yml │ ├── ibmcloud_node_scenarios.yml │ ├── ingress_namespace.yaml │ ├── network_chaos.yaml │ ├── network_chaos_ingress.yml │ ├── openshift-apiserver.yml │ ├── openshift-kube-apiserver.yml │ ├── pod_egress_shaping.yml │ ├── pod_ingress_shaping.yml │ ├── pod_network_outage.yml │ ├── prom_kill.yml │ ├── prometheus.yml │ ├── pvc_scenario.yaml │ ├── regex_namespace.yaml │ ├── regex_openshift_pod_kill.yml │ ├── time_scenarios_example.yml │ ├── vmware_node_scenarios.yml │ ├── zone_outage.yaml │ └── zone_outage_gcp.yaml ├── plugin.schema.README.md └── plugin.schema.json ├── server.py ├── setup.cfg ├── setup.py ├── tests ├── test_ingress_network_plugin.py └── test_run_python_plugin.py └── utils ├── arcaflow └── ocp-chaos │ ├── README.md │ ├── config.yaml │ ├── input.yaml │ ├── subworkflows │ ├── cpu-hog.yaml │ ├── kubeburner.yaml │ └── pod-chaos.yaml │ └── workflow.yaml ├── chaos_ai ├── README.md ├── config │ └── experiments │ │ └── .gitkeep ├── docker │ ├── Dockerfile │ ├── aichaos-config.json │ ├── config │ │ ├── experiments │ │ │ └── log.yml │ │ ├── pod-delete.json │ │ └── yml │ │ │ ├── chaosGen.yml │ │ │ ├── episodes.yml │ │ │ ├── log.yml │ │ │ ├── qtable.yml │ │ │ └── status.yml │ ├── requirements.txt │ └── swagger_api.py ├── generate_wheel_package.py ├── requirements.txt └── src │ ├── __init__.py │ ├── aichaos.py │ ├── aichaos_main.py │ ├── experiments.py │ ├── kraken_utils.py │ ├── qlearning.py │ ├── swagger_api.py │ ├── test_application.py │ └── utils.py └── chaos_recommender ├── README.md ├── 
chaos_recommender.py └── recommender_config.yaml /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | ## Documentation 5 | - [ ] **Is documentation needed for this update?** 6 | 7 | If checked, a documentation PR must be created and merged in the [website repository](https://github.com/krkn-chaos/website/). 8 | 9 | ## Related Documentation PR (if applicable) 10 | -------------------------------------------------------------------------------- /.github/release-template.md: -------------------------------------------------------------------------------- 1 | ## Release {VERSION} 2 | 3 | ### Download Artifacts 4 | - 📦 Krkn sources (noarch): [krkn-{VERSION}-src.tar.gz](https://krkn-chaos.gateway.scarf.sh/krkn-src-{VERSION}.tar.gz) 5 | 6 | ### Changes 7 | {CHANGES} 8 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/.github/release.yml -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | on: 3 | push: 4 | tags: ['v[0-9].[0-9]+.[0-9]+'] 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out code 12 | uses: actions/checkout@v3 13 | - name: Build the Docker images 14 | if: startsWith(github.ref, 'refs/tags') 15 | run: | 16 | ./containers/compile_dockerfile.sh 17 | docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg TAG=${GITHUB_REF#refs/tags/} 18 | docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn 19 | docker tag quay.io/krkn-chaos/krkn quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/} 20 | docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn:${GITHUB_REF#refs/tags/} 21 | 22 | - name: Test Build the Docker images 23 | if: ${{ github.event_name == 'pull_request' }} 24 | run: | 25 | ./containers/compile_dockerfile.sh 26 | docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg PR_NUMBER=${{ github.event.pull_request.number }} 27 | - name: Login in quay 28 | if: startsWith(github.ref, 'refs/tags') 29 | run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN} 30 | env: 31 | QUAY_USER: ${{ secrets.QUAY_USERNAME }} 32 | QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }} 33 | - name: Push the KrknChaos Docker images 34 | if: startsWith(github.ref, 'refs/tags') 35 | run: | 36 | docker push quay.io/krkn-chaos/krkn 37 | docker push quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/} 38 | - name: Login in to redhat-chaos quay 39 | if: startsWith(github.ref, 'refs/tags/v') 40 | run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN} 41 | env: 42 | QUAY_USER: ${{ secrets.QUAY_USER_1 }} 43 | QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }} 44 | - name: Push the RedHat Chaos Docker images 45 | if: startsWith(github.ref, 'refs/tags') 46 | run: | 47 | docker push quay.io/redhat-chaos/krkn 48 | docker push quay.io/redhat-chaos/krkn:${GITHUB_REF#refs/tags/} 49 | - name: Rebuild krkn-hub 50 | if: startsWith(github.ref, 'refs/tags') 51 | uses: redhat-chaos/actions/krkn-hub@main 52 | with: 53 | QUAY_USER: ${{ secrets.QUAY_USERNAME }} 54 | QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }} 55 | AUTOPUSH: ${{ secrets.AUTOPUSH }} 56 | 
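The tag-triggered build in the workflow above can be reproduced locally when debugging image problems. The following is a minimal sketch and not a file from this repository: it assumes a local Docker daemon, a checkout of the tag to build, and that `containers/compile_dockerfile.sh` is run from the repository root exactly as the workflow does; the tag value is a hypothetical example.

```bash
#!/usr/bin/env bash
# Sketch: locally reproduce the tagged image build performed by docker-image.yml.
set -euo pipefail

TAG="v1.2.3"   # hypothetical tag; in CI the workflow derives it from ${GITHUB_REF#refs/tags/}
git checkout "tags/${TAG}"

# Render the Dockerfile from its template, as the workflow does before building.
./containers/compile_dockerfile.sh

# Build and tag the image the same way the "Build the Docker images" step does.
docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg TAG="${TAG}"
docker tag quay.io/krkn-chaos/krkn "quay.io/krkn-chaos/krkn:${TAG}"
```

Pushing to quay.io is deliberately omitted here; in CI those steps run only for tag refs and use the registry credentials stored as repository secrets.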
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Create Release 2 | on: 3 | push: 4 | tags: 5 | - 'v*' 6 | jobs: 7 | release: 8 | permissions: 9 | contents: write 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: calculate previous tag 14 | run: | 15 | git fetch --tags origin 16 | PREVIOUS_TAG=$(git tag --sort=-creatordate | sed -n '2 p') 17 | echo $PREVIOUS_TAG 18 | echo "PREVIOUS_TAG=$PREVIOUS_TAG" >> "$GITHUB_ENV" 19 | - name: generate release notes from template 20 | id: release-notes 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | run: | 24 | NOTES=$(gh api \ 25 | --method POST \ 26 | -H "Accept: application/vnd.github+json" \ 27 | -H "X-GitHub-Api-Version: 2022-11-28" \ 28 | /repos/krkn-chaos/krkn/releases/generate-notes \ 29 | -f "tag_name=${{ github.ref_name }}" -f "target_commitish=main" -f "previous_tag_name=${{ env.PREVIOUS_TAG }}" | jq -r .body) 30 | echo "NOTES<<EOF" >> $GITHUB_ENV 31 | echo "$NOTES" >> $GITHUB_ENV 32 | echo "EOF" >> $GITHUB_ENV 33 | 34 | - name: replace placeholders in template 35 | run: | 36 | echo "${{ env.NOTES }}" 37 | TEMPLATE=$(cat .github/release-template.md) 38 | VERSION=${{ github.ref_name }} 39 | NOTES="${{ env.NOTES }}" 40 | OUTPUT=${TEMPLATE//\{VERSION\}/$VERSION} 41 | OUTPUT=${OUTPUT//\{CHANGES\}/$NOTES} 42 | echo "$OUTPUT" > release-notes.md 43 | - name: create release 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | run: | 47 | gh release create ${{ github.ref_name }} --title "${{ github.ref_name }}" -F release-notes.md 48 | -------------------------------------------------------------------------------- /.github/workflows/require-docs.yml: -------------------------------------------------------------------------------- 1 | name: Require Documentation Update 2 | on: 3 | pull_request: 4 | types: [opened, edited, synchronize] 5 | branches: 6 | - main 7 | jobs: 8 | check-docs: 9 | name: Check Documentation Update 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v4 14 | 15 | - name: Check if Documentation is Required 16 | id: check_docs 17 | run: | 18 | echo "Checking PR body for documentation checkbox..." 19 | # Read the PR body from the GitHub event payload 20 | if echo "${{ github.event.pull_request.body }}" | grep -qi '\[x\].*documentation needed'; then 21 | echo "Documentation required detected." 22 | echo "docs_required=true" >> $GITHUB_OUTPUT 23 | else 24 | echo "Documentation not required." 25 | echo "docs_required=false" >> $GITHUB_OUTPUT 26 | fi 27 | 28 | - name: Enforce Documentation Update (if required) 29 | if: steps.check_docs.outputs.docs_required == 'true' 30 | env: 31 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 32 | run: | 33 | # Retrieve feature branch and repository owner from the GitHub context 34 | FEATURE_BRANCH="${{ github.head_ref }}" 35 | REPO_OWNER="${{ github.repository_owner }}" 36 | WEBSITE_REPO="website" 37 | echo "Searching for a merged documentation PR for feature branch: $FEATURE_BRANCH in $REPO_OWNER/$WEBSITE_REPO..."
38 | MERGED_PR=$(gh pr list --repo "$REPO_OWNER/$WEBSITE_REPO" --state merged --json headRefName,title,url | jq -r \ 39 | --arg FEATURE_BRANCH "$FEATURE_BRANCH" '.[] | select(.title | contains($FEATURE_BRANCH)) | .url') 40 | if [[ -z "$MERGED_PR" ]]; then 41 | echo ":x: Documentation PR for branch '$FEATURE_BRANCH' is required and has not been merged." 42 | exit 1 43 | else 44 | echo ":white_check_mark: Found merged documentation PR: $MERGED_PR" 45 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | .DS_Store 16 | *.out 17 | kube-burner* 18 | kube_burner* 19 | recommender_*.json 20 | 21 | # Project files 22 | .ropeproject 23 | .project 24 | .pydevproject 25 | .settings 26 | .idea 27 | .vscode 28 | config/debug.yaml 29 | tags 30 | 31 | # Package files 32 | *.egg 33 | *.eggs/ 34 | .installed.cfg 35 | *.egg-info 36 | 37 | # Unittest and coverage 38 | htmlcov/* 39 | .coverage 40 | junit.xml 41 | coverage.xml 42 | .pytest_cache/ 43 | 44 | # Build and docs folder/files 45 | build/* 46 | dist/* 47 | sdist/* 48 | docs/api/* 49 | docs/_rst/* 50 | docs/_build/* 51 | cover/* 52 | MANIFEST 53 | 54 | # Per-project virtualenvs 55 | .venv*/ 56 | venv*/ 57 | kraken.report 58 | collected-metrics/* 59 | inspect.local.* 60 | 61 | # Tests 62 | !CI/config/common_test_config.yaml 63 | CI/out/* 64 | CI/ci_results 65 | CI/legacy/*node.yaml 66 | CI/results.markdown 67 | 68 | #env 69 | chaos/* 70 | 71 | -------------------------------------------------------------------------------- /.gitleaks.toml: -------------------------------------------------------------------------------- 1 | [allowlist] 2 | description = "Global Allowlist" 3 | 4 | paths = [ 5 | '''kraken/arcaflow_plugin/fixtures/*''' 6 | ] 7 | -------------------------------------------------------------------------------- /ADOPTERS.md: -------------------------------------------------------------------------------- 1 | # Krkn Adopters 2 | 3 | This is a list of organizations that have publicly acknowledged usage of Krkn and shared details of how they are leveraging it in their environment for chaos engineering use cases. Do you want to add yourself to this list? Please fork the repository and open a PR with the required change. 4 | 5 | | Organization | Since | Website | Use-Case | 6 | |:-|:-|:-|:-| 7 | | MarketAxess | 2024 | https://www.marketaxess.com/ | Kraken enables us to achieve our goal of increasing the reliability of our cloud products on Kubernetes. The tool allows us to automatically run various chaos scenarios, identify resilience and performance bottlenecks, and seamlessly restore the system to its original state once scenarios finish. These chaos scenarios include pod disruptions, node (EC2) outages, simulating availability zone (AZ) outages, and filling up storage spaces like EBS and EFS. The community is highly responsive to requests and works on expanding the tool's capabilities. MarketAxess actively contributes to the project, adding features such as the ability to leverage existing network ACLs and proposing several feature improvements to enhance test coverage. 
| 8 | | Red Hat Openshift | 2020 | https://www.redhat.com/ | Kraken is a highly reliable chaos testing tool used to ensure the quality and resiliency of Red Hat Openshift. The engineering team runs all the test scenarios under Kraken on different cloud platforms on both self-managed and cloud services environments prior to the release of a new version of the product. The team also contributes to the Kraken project consistently which helps the test scenarios to keep up with the new features introduced to the product. Inclusion of this test coverage has contributed to gaining the trust of new and existing customers of the product. | 9 | -------------------------------------------------------------------------------- /CI/README.md: -------------------------------------------------------------------------------- 1 | ## CI Tests 2 | 3 | ### First steps 4 | Edit [functional_tests](tests/functional_tests) with tests you want to run 5 | 6 | ### How to run 7 | ```./CI/run.sh``` 8 | 9 | This will run kraken using python, make sure python3 is set up and configured properly with all requirements 10 | 11 | 12 | ### Adding a test case 13 | 14 | 1. Add in simple scenario yaml file to execute under [../CI/scenarios/](legacy) 15 | 16 | 2. Copy [test_application_outages.sh](tests/test_app_outages.sh) for example on how to get started 17 | 18 | 3. Lines to change for bash script 19 | 20 | a. 11: Set scenario type to be your new scenario name 21 | 22 | b. 12: Add pointer to scenario file for the test 23 | 24 | c. 13: If a post action file is needed; add in pointer 25 | 26 | d. 14: Set filled in config yaml file name specific to your scenario 27 | 28 | e. 15: Make sure name of config in line 14 matches what you pass on this line 29 | 30 | 4. Add test name to [functional_tests](../CI/tests/functional_tests) file 31 | 32 | a. This will be the name of the file without ".sh" 33 | 34 | 5. 
If any changes to the main config (other than the scenario list), please be sure to add them into the [common_config](config/common_test_config.yaml) 35 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/cluster_shut_down_scenario.yml: -------------------------------------------------------------------------------- 1 | cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes 2 | runs: 1 # Number of times to execute the cluster_shut_down scenario 3 | shut_down_duration: 10 # duration in seconds to shut down the cluster 4 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 5 | timeout: 60 # Number of seconds to wait for each node to be stopped or running 6 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/node_scenario.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # node chaos scenarios to be injected 3 | - node_reboot_scenario 4 | node_name: # node on which scenario has to be injected 5 | label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection 6 | instance_kill_count: 1 # number of times to inject each scenario under actions 7 | timeout: 80 # duration to wait for completion of node scenario injection 8 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 9 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/volume_scenario.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | kubernetes.io/metadata.name: kraken 6 | pod-security.kubernetes.io/audit: privileged 7 | pod-security.kubernetes.io/enforce: privileged 8 | pod-security.kubernetes.io/enforce-version: v1.24 9 | pod-security.kubernetes.io/warn: privileged 10 | security.openshift.io/scc.podSecurityLabelSync: "false" 11 | name: kraken 12 | --- 13 | apiVersion: v1 14 | kind: PersistentVolume 15 | metadata: 16 | name: kraken-test-pv 17 | namespace: kraken 18 | labels: 19 | type: local 20 | spec: 21 | storageClassName: manual 22 | capacity: 23 | storage: 2Gi 24 | accessModes: 25 | - ReadWriteOnce 26 | hostPath: 27 | path: "/mnt/data" 28 | --- 29 | apiVersion: v1 30 | kind: PersistentVolumeClaim 31 | metadata: 32 | name: kraken-test-pvc 33 | namespace: kraken 34 | spec: 35 | storageClassName: manual 36 | accessModes: 37 | - ReadWriteOnce 38 | resources: 39 | requests: 40 | storage: 1Gi 41 | --- 42 | apiVersion: v1 43 | kind: Pod 44 | metadata: 45 | name: kraken-test-pod 46 | namespace: kraken 47 | spec: 48 | volumes: 49 | - name: kraken-test-pv 50 | persistentVolumeClaim: 51 | claimName: kraken-test-pvc 52 | containers: 53 | - name: kraken-test-container 54 | image: 'quay.io/centos7/httpd-24-centos7:latest' 55 | volumeMounts: 56 | - mountPath: "/home/krake-dir/" 57 | name: kraken-test-pv 58 | securityContext: 59 | privileged: true 60 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/zone_outage.yaml: -------------------------------------------------------------------------------- 1 | zone_outage: # Scenario to create an outage of a zone by tweaking network ACL 2 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs. 
aws is only platform supported currently for this scenario. 3 | duration: 10 # duration in seconds after which the zone will be back online 4 | vpc_id: $VPC_ID # cluster virtual private network to target 5 | subnet_id: $SUBNET_ID # List of subnet-id's to deny both ingress and egress traffic 6 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/zone_outage_env.yaml: -------------------------------------------------------------------------------- 1 | zone_outage: # Scenario to create an outage of a zone by tweaking network ACL 2 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs. aws is only platform supported currently for this scenario. 3 | duration: 10 # duration in seconds after which the zone will be back online 4 | vpc_id: vpc-0b43122e2d2ee058f # cluster virtual private network to target 5 | subnet_id: [subnet-088c73e73587d8aba] # List of subnet-id's to deny both ingress and egress traffic 6 | -------------------------------------------------------------------------------- /CI/legacy/tests/test_nodes.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function funtional_test_node_crash { 10 | 11 | export scenario_type="node_scenarios" 12 | export scenario_file="CI/scenarios/node_scenario.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/node_config.yaml 15 | 16 | python3 -m coverage run -a run_kraken.py -c CI/config/node_config.yaml 17 | echo "Node scenario test: Success" 18 | } 19 | 20 | funtional_test_node_crash 21 | -------------------------------------------------------------------------------- /CI/legacy/tests/test_shut_down.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_shut_down { 10 | 11 | export scenario_type="cluster_shut_down_scenarios" 12 | export scenario_file="- CI/scenarios/cluster_shut_down_scenario.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/shut_down.yaml 15 | python3 -m coverage run -a run_kraken.py -c CI/config/shut_down.yaml 16 | echo "Cluster shut down scenario test: Success" 17 | } 18 | 19 | functional_test_shut_down 20 | -------------------------------------------------------------------------------- /CI/legacy/tests/test_zone.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_zone_crash { 10 | 11 | export scenario_type="zone_outages" 12 | export scenario_file="CI/scenarios/zone_outage_env.yaml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/zone3_config.yaml 15 | envsubst < CI/scenarios/zone_outage.yaml > CI/scenarios/zone_outage_env.yaml 16 | python3 -m coverage run -a run_kraken.py -c CI/config/zone3_config.yaml 17 | echo "zone3 scenario test: Success" 18 | } 19 | 20 | functional_test_zone_crash 21 | -------------------------------------------------------------------------------- /CI/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MAX_RETRIES=60 3 | 4 | KUBECTL=`which kubectl 2>/dev/null` 5 | [[ $? 
!= 0 ]] && echo "[ERROR]: kubectl missing, please install it and try again" && exit 1 6 | 7 | wait_cluster_become_ready() { 8 | COUNT=1 9 | until `$KUBECTL get namespace > /dev/null 2>&1` 10 | do 11 | echo "[INF] waiting Kubernetes to become ready, after $COUNT check" 12 | sleep 3 13 | [[ $COUNT == $MAX_RETRIES ]] && echo "[ERR] max retries exceeded, failing" && exit 1 14 | ((COUNT++)) 15 | done 16 | } 17 | 18 | 19 | 20 | ci_tests_loc="CI/tests/functional_tests" 21 | 22 | echo -e "********* Running Functional Tests Suite *********\n\n" 23 | 24 | rm -rf CI/out 25 | 26 | mkdir CI/out 27 | 28 | results_file_name="results.markdown" 29 | 30 | rm -f CI/$results_file_name 31 | 32 | results="CI/$results_file_name" 33 | 34 | # Prep the results.markdown file 35 | echo 'Test | Result | Duration' >> $results 36 | echo '-----------------------|--------|---------' >> $results 37 | 38 | # Run each test 39 | failed_tests=() 40 | for test_name in `cat CI/tests/functional_tests` 41 | do 42 | #wait_cluster_become_ready 43 | return_value=`./CI/run_test.sh $test_name $results` 44 | if [[ $return_value == 1 ]] 45 | then 46 | echo "Failed" 47 | failed_tests+=("$test_name") 48 | fi 49 | wait_cluster_become_ready 50 | done 51 | 52 | 53 | if (( ${#failed_tests[@]}>0 )) 54 | then 55 | echo -e "\n\n======================================================================" 56 | echo -e "\n FUNCTIONAL TESTS FAILED ${failed_tests[*]} ABORTING" 57 | echo -e "\n======================================================================\n\n" 58 | 59 | for test in "${failed_tests[@]}" 60 | do 61 | echo -e "\n********** $test KRKN RUN OUTPUT **********\n" 62 | cat "CI/out/$test.out" 63 | echo -e "\n********************************************\n\n\n\n" 64 | done 65 | 66 | exit 1 67 | fi 68 | -------------------------------------------------------------------------------- /CI/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | readonly SECONDS_PER_HOUR=3600 3 | readonly SECONDS_PER_MINUTE=60 4 | function get_time_format() { 5 | seconds=$1 6 | hours=$((${seconds} / ${SECONDS_PER_HOUR})) 7 | seconds=$((${seconds} % ${SECONDS_PER_HOUR})) 8 | minutes=$((${seconds} / ${SECONDS_PER_MINUTE})) 9 | seconds=$((${seconds} % ${SECONDS_PER_MINUTE})) 10 | echo $hours:$minutes:$seconds 11 | } 12 | ci_test=`echo $1` 13 | 14 | results_file=$2 15 | 16 | echo -e "test: ${ci_test}" >&2 17 | 18 | ci_results="CI/out/$ci_test.out" 19 | # Test ci 20 | 21 | echo "results $ci_results" >> $ci_results 22 | SECONDS=0 23 | if /bin/bash CI/tests/$ci_test.sh >> $ci_results 2>&1 24 | then 25 | # if the test passes update the results and complete 26 | duration=$SECONDS 27 | duration=$(get_time_format $duration) 28 | echo -e "> $ci_test: Successful\n" >&2 29 | echo "$ci_test | Pass | $duration" >> $results_file 30 | count=$retries 31 | # return value for run.sh 32 | echo 0 33 | else 34 | duration=$SECONDS 35 | duration=$(get_time_format $duration) 36 | echo -e "> $ci_test: Failed\n" >&2 37 | echo "$ci_test | Fail | $duration" >> $results_file 38 | # return value for run.sh 39 | echo 1 40 | fi 41 | -------------------------------------------------------------------------------- /CI/templates/container_scenario_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: container 5 | labels: 6 | scenario: container 7 | spec: 8 | hostNetwork: true 9 | containers: 10 | - name: fedtools 11 | image: docker.io/fedora/tools 12 | 
command: 13 | - /bin/sh 14 | - -c 15 | - | 16 | sleep infinity -------------------------------------------------------------------------------- /CI/templates/outage_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: outage 5 | labels: 6 | scenario: outage 7 | spec: 8 | hostNetwork: true 9 | containers: 10 | - name: fedtools 11 | image: docker.io/fedora/tools 12 | command: 13 | - /bin/sh 14 | - -c 15 | - | 16 | sleep infinity -------------------------------------------------------------------------------- /CI/templates/service_hijacking.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: nginx 5 | labels: 6 | app.kubernetes.io/name: proxy 7 | spec: 8 | containers: 9 | - name: nginx 10 | image: nginx:stable 11 | ports: 12 | - containerPort: 80 13 | name: http-web-svc 14 | 15 | --- 16 | apiVersion: v1 17 | kind: Service 18 | metadata: 19 | name: nginx-service 20 | spec: 21 | selector: 22 | app.kubernetes.io/name: proxy 23 | type: NodePort 24 | ports: 25 | - name: name-of-service-port 26 | protocol: TCP 27 | port: 80 28 | targetPort: http-web-svc 29 | nodePort: 30036 -------------------------------------------------------------------------------- /CI/templates/time_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: time-skew 5 | labels: 6 | scenario: time-skew 7 | spec: 8 | hostNetwork: true 9 | containers: 10 | - name: fedtools 11 | image: docker.io/fedora/tools 12 | command: 13 | - /bin/sh 14 | - -c 15 | - | 16 | sleep infinity -------------------------------------------------------------------------------- /CI/tests/common.sh: -------------------------------------------------------------------------------- 1 | ERRORED=false 2 | 3 | function finish { 4 | if [ $? != 0 ] && [ $ERRORED != "true" ] 5 | then 6 | error 7 | fi 8 | } 9 | 10 | function error { 11 | exit_code=$? 12 | if [ $exit_code == 1 ] 13 | then 14 | echo "Error caught." 
15 | ERRORED=true 16 | elif [ $exit_code == 2 ] 17 | then 18 | echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure" 19 | exit 0 20 | fi 21 | } 22 | 23 | function get_node { 24 | worker_node=$(kubectl get nodes --no-headers | grep worker | head -n 1) 25 | export WORKER_NODE=$worker_node 26 | } 27 | -------------------------------------------------------------------------------- /CI/tests/functional_tests: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CI/tests/test_app_outages.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_app_outage { 10 | yq -i '.application_outage.duration=10' scenarios/openshift/app_outage.yaml 11 | yq -i '.application_outage.pod_selector={"scenario":"outage"}' scenarios/openshift/app_outage.yaml 12 | yq -i '.application_outage.namespace="default"' scenarios/openshift/app_outage.yaml 13 | export scenario_type="application_outages_scenarios" 14 | export scenario_file="scenarios/openshift/app_outage.yaml" 15 | export post_config="" 16 | envsubst < CI/config/common_test_config.yaml > CI/config/app_outage.yaml 17 | python3 -m coverage run -a run_kraken.py -c CI/config/app_outage.yaml 18 | echo "App outage scenario test: Success" 19 | } 20 | 21 | functional_test_app_outage 22 | -------------------------------------------------------------------------------- /CI/tests/test_container.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | pod_file="CI/scenarios/hello_pod.yaml" 9 | 10 | function functional_test_container_crash { 11 | yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml 12 | yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml 13 | yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml 14 | export scenario_type="container_scenarios" 15 | export scenario_file="scenarios/openshift/container_etcd.yml" 16 | export post_config="" 17 | envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml 18 | 19 | python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml 20 | echo "Container scenario test: Success" 21 | } 22 | 23 | functional_test_container_crash 24 | -------------------------------------------------------------------------------- /CI/tests/test_cpu_hog.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_cpu_hog { 10 | yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/cpu-hog.yml 11 | 12 | export scenario_type="hog_scenarios" 13 | export scenario_file="scenarios/kube/cpu-hog.yml" 14 | export post_config="" 15 | envsubst < CI/config/common_test_config.yaml > CI/config/cpu_hog.yaml 16 | python3 -m coverage run -a run_kraken.py -c CI/config/cpu_hog.yaml 17 | echo "CPU Hog: Success" 18 | } 19 | 20 | functional_test_cpu_hog -------------------------------------------------------------------------------- /CI/tests/test_io_hog.sh: 
-------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_io_hog { 10 | yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/io-hog.yml 11 | export scenario_type="hog_scenarios" 12 | export scenario_file="scenarios/kube/io-hog.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/io_hog.yaml 15 | python3 -m coverage run -a run_kraken.py -c CI/config/io_hog.yaml 16 | echo "IO Hog: Success" 17 | } 18 | 19 | functional_test_io_hog -------------------------------------------------------------------------------- /CI/tests/test_memory_hog.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_memory_hog { 10 | yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/memory-hog.yml 11 | export scenario_type="hog_scenarios" 12 | export scenario_file="scenarios/kube/memory-hog.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/memory_hog.yaml 15 | python3 -m coverage run -a run_kraken.py -c CI/config/memory_hog.yaml 16 | echo "Memory Hog: Success" 17 | } 18 | 19 | functional_test_memory_hog -------------------------------------------------------------------------------- /CI/tests/test_namespace.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | function funtional_test_namespace_deletion { 9 | export scenario_type="service_disruption_scenarios" 10 | export scenario_file="scenarios/openshift/ingress_namespace.yaml" 11 | export post_config="" 12 | yq '.scenarios[0].namespace="^namespace-scenario$"' -i scenarios/openshift/ingress_namespace.yaml 13 | yq '.scenarios[0].wait_time=30' -i scenarios/openshift/ingress_namespace.yaml 14 | yq '.scenarios[0].action="delete"' -i scenarios/openshift/ingress_namespace.yaml 15 | envsubst < CI/config/common_test_config.yaml > CI/config/namespace_config.yaml 16 | python3 -m coverage run -a run_kraken.py -c CI/config/namespace_config.yaml 17 | echo "Namespace scenario test: Success" 18 | } 19 | 20 | funtional_test_namespace_deletion 21 | -------------------------------------------------------------------------------- /CI/tests/test_net_chaos.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_network_chaos { 10 | yq -i '.network_chaos.duration=10' scenarios/openshift/network_chaos.yaml 11 | yq -i '.network_chaos.node_name="kind-worker2"' scenarios/openshift/network_chaos.yaml 12 | yq -i '.network_chaos.egress.bandwidth="100mbit"' scenarios/openshift/network_chaos.yaml 13 | yq -i 'del(.network_chaos.interfaces)' scenarios/openshift/network_chaos.yaml 14 | yq -i 'del(.network_chaos.label_selector)' scenarios/openshift/network_chaos.yaml 15 | yq -i 'del(.network_chaos.egress.latency)' scenarios/openshift/network_chaos.yaml 16 | yq -i 'del(.network_chaos.egress.loss)' scenarios/openshift/network_chaos.yaml 17 | 18 | export scenario_type="network_chaos_scenarios" 19 | export 
scenario_file="scenarios/openshift/network_chaos.yaml" 20 | export post_config="" 21 | envsubst < CI/config/common_test_config.yaml > CI/config/network_chaos.yaml 22 | python3 -m coverage run -a run_kraken.py -c CI/config/network_chaos.yaml 23 | echo "Network Chaos test: Success" 24 | } 25 | 26 | functional_test_network_chaos 27 | -------------------------------------------------------------------------------- /CI/tests/test_telemetry.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_telemetry { 10 | AWS_CLI=`which aws` 11 | [ -z "$AWS_CLI" ]&& echo "AWS cli not found in path" && exit 1 12 | [ -z "$AWS_BUCKET" ] && echo "AWS bucket not set in environment" && exit 1 13 | 14 | export RUN_TAG="funtest-telemetry" 15 | yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml 16 | yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml 17 | yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml 18 | yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml 19 | yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml 20 | 21 | export scenario_type="hog_scenarios" 22 | 23 | export scenario_file="scenarios/kube/cpu-hog.yml" 24 | 25 | export post_config="" 26 | envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml 27 | retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml) 28 | RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"` 29 | $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files 30 | echo "checking if telemetry files are uploaded on s3" 31 | cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 ) 32 | cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 ) 33 | cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 ) 34 | echo "all files uploaded!" 
35 | echo "Telemetry Collection: Success" 36 | } 37 | 38 | functional_test_telemetry -------------------------------------------------------------------------------- /CI/tests/test_time.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_time_scenario { 10 | yq -i '.time_scenarios[0].label_selector="scenario=time-skew"' scenarios/openshift/time_scenarios_example.yml 11 | yq -i '.time_scenarios[0].container_name=""' scenarios/openshift/time_scenarios_example.yml 12 | yq -i '.time_scenarios[0].namespace="default"' scenarios/openshift/time_scenarios_example.yml 13 | yq -i '.time_scenarios[1].label_selector="kubernetes.io/hostname=kind-worker2"' scenarios/openshift/time_scenarios_example.yml 14 | export scenario_type="time_scenarios" 15 | export scenario_file="scenarios/openshift/time_scenarios_example.yml" 16 | export post_config="" 17 | envsubst < CI/config/common_test_config.yaml > CI/config/time_config.yaml 18 | 19 | python3 -m coverage run -a run_kraken.py -c CI/config/time_config.yaml 20 | echo "Time scenario test: Success" 21 | } 22 | 23 | functional_test_time_scenario 24 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This document contains a list of maintainers in this repo. 4 | 5 | ## Current Maintainers 6 | 7 | | Maintainer | GitHub ID | Email | 8 | |---------------------| --------------------------------------------------------- | ----------------------- | 9 | | Ravi Elluri | [chaitanyaenr](https://github.com/chaitanyaenr) | nelluri@redhat.com | 10 | | Pradeep Surisetty | [psuriset](https://github.com/psuriset) | psuriset@redhat.com | 11 | | Paige Rubendall | [paigerube14](https://github.com/paigerube14) | prubenda@redhat.com | 12 | | Tullio Sebastiani | [tsebastiani](https://github.com/tsebastiani) | tsebasti@redhat.com | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Krkn aka Kraken 2 | ![Workflow-Status](https://github.com/krkn-chaos/krkn/actions/workflows/docker-image.yml/badge.svg) 3 | ![coverage](https://krkn-chaos.github.io/krkn-lib-docs/coverage_badge_krkn.svg) 4 | ![action](https://github.com/krkn-chaos/krkn/actions/workflows/tests.yml/badge.svg) 5 | [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10548/badge)](https://www.bestpractices.dev/projects/10548) 6 | 7 | ![Krkn logo](media/logo.png) 8 | 9 | Chaos and resiliency testing tool for Kubernetes. 10 | Kraken injects deliberate failures into Kubernetes clusters to check if it is resilient to turbulent conditions. 11 | 12 | 13 | ### Workflow 14 | ![Kraken workflow](media/kraken-workflow.png) 15 | 16 | 17 | 19 | 20 | 21 | ### How to Get Started 22 | Instructions on how to setup, configure and run Kraken can be found in the [documentation](https://krkn-chaos.dev/docs/). 
23 | 24 | 25 | ### Blogs and other useful resources 26 | - Blog post on introduction to Kraken: https://www.openshift.com/blog/introduction-to-kraken-a-chaos-tool-for-openshift/kubernetes 27 | - Discussion and demo on how Kraken can be leveraged to ensure OpenShift is reliable, performant and scalable: https://www.youtube.com/watch?v=s1PvupI5sD0&ab_channel=OpenShift 28 | - Blog post emphasizing the importance of making Chaos part of Performance and Scale runs to mimic the production environments: https://www.openshift.com/blog/making-chaos-part-of-kubernetes/openshift-performance-and-scalability-tests 29 | - Blog post on findings from Chaos test runs: https://cloud.redhat.com/blog/openshift/kubernetes-chaos-stories 30 | - Discussion with CNCF TAG App Delivery on Krkn workflow, features and addition to CNCF sandbox: [Github](https://github.com/cncf/sandbox/issues/44), [Tracker](https://github.com/cncf/tag-app-delivery/issues/465), [recording](https://www.youtube.com/watch?v=nXQkBFK_MWc&t=722s) 31 | - Blog post on supercharging chaos testing using AI integration in Krkn: https://www.redhat.com/en/blog/supercharging-chaos-testing-using-ai 32 | - Blog post announcing Krkn joining CNCF Sandbox: https://www.redhat.com/en/blog/krknchaos-joining-cncf-sandbox 33 | 34 | 35 | ### Roadmap 36 | Enhancements being planned can be found in the [roadmap](ROADMAP.md). 37 | 38 | 39 | ### Contributions 40 | We are always looking for more enhancements, fixes to make it better, any contributions are most welcome. Feel free to report or work on the issues filed on github. 41 | 42 | [More information on how to Contribute](https://krkn-chaos.dev/docs/contribution-guidelines/) 43 | 44 | 45 | ### Community 46 | Key Members(slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, tsebasti/Tullio Sebastiani, yogi/Yogananth Subramanian, sahil/Sahil Shah, pradeep/Pradeep Surisetty and ravielluri/Naga Ravi Chaitanya Elluri. 47 | * [**#krkn on Kubernetes Slack**](https://kubernetes.slack.com/messages/C05SFMHRWK1) 48 | 49 | The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/legal/trademark-usage). 50 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | ## Krkn Roadmap 2 | 3 | Following are a list of enhancements that we are planning to work on adding support in Krkn. Of course any help/contributions are greatly appreciated. 
4 | 5 | - [ ] [Ability to run multiple chaos scenarios in parallel under load to mimic real world outages](https://github.com/krkn-chaos/krkn/issues/424) 6 | - [x] [Centralized storage for chaos experiments artifacts](https://github.com/krkn-chaos/krkn/issues/423) 7 | - [ ] [Support for causing DNS outages](https://github.com/krkn-chaos/krkn/issues/394) 8 | - [x] [Chaos recommender](https://github.com/krkn-chaos/krkn/tree/main/utils/chaos-recommender) to suggest scenarios having probability of impacting the service under test using profiling results 9 | - [ ] Chaos AI integration to improve test coverage while reducing fault space to save costs and execution time 10 | - [x] [Support for pod level network traffic shaping](https://github.com/krkn-chaos/krkn/issues/393) 11 | - [ ] [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/krkn-chaos/krkn/issues/124) 12 | - [x] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/krkn-chaos/krkn/issues/185, https://github.com/redhat-chaos/krkn/issues/186 13 | - [x] Continue to improve [Chaos Testing Guide](https://krkn-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well as the applications running on top of it, are resilient and performant under chaotic conditions. 14 | - [x] [Switch documentation references to Kubernetes](https://github.com/krkn-chaos/krkn/issues/495) 15 | - [x] [OCP and Kubernetes functionalities segregation](https://github.com/krkn-chaos/krkn/issues/497) 16 | - [x] [Krknctl - client for running Krkn scenarios with ease](https://github.com/krkn-chaos/krknctl) 17 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | We attach great importance to code security. We are very grateful to the users, security vulnerability researchers, etc. for reporting security vulnerabilities to the Krkn community. All reported security vulnerabilities will be carefully assessed and addressed in a timely manner. 4 | 5 | 6 | ## Security Checks 7 | 8 | Krkn leverages [Snyk](https://snyk.io/) to ensure that any security vulnerabilities found 9 | in the code base and dependencies are fixed and published in the latest release. Security 10 | vulnerability checks are enabled for each pull request to enable developers to get insights 11 | and proactively fix them. 12 | 13 | 14 | ## Reporting a Vulnerability 15 | 16 | The Krkn project treats security vulnerabilities seriously, so we 17 | strive to take action quickly when required. 18 | 19 | The project requests that security issues be disclosed in a responsible 20 | manner to allow adequate time to respond. If a security issue or 21 | vulnerability has been found, please disclose the details to our 22 | dedicated email address: 23 | 24 | cncf-krkn-maintainers@lists.cncf.io 25 | 26 | You can also use the [GitHub vulnerability report mechanism](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability#privately-reporting-a-security-vulnerability) to report the security vulnerability. 27 | 28 | Please include as much information as possible with the report.
The 29 | following details assist with analysis efforts: 30 | - Description of the vulnerability 31 | - Affected component (version, commit, branch etc) 32 | - Affected code (file path, line numbers) 33 | - Exploit code 34 | 35 | 36 | ## Security Team 37 | 38 | The security team currently consists of the [Maintainers of Krkn](https://github.com/krkn-chaos/krkn/blob/main/MAINTAINERS.md) 39 | 40 | 41 | ## Process and Supported Releases 42 | 43 | The Krkn security team will investigate and provide a fix in a timely manner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes. 44 | -------------------------------------------------------------------------------- /ansible/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | callback_whitelist = profile_tasks 3 | host_key_checking = False 4 | log_path = ~/ansible.log 5 | retry_files_enabled = False 6 | # work around privilege escalation timeouts in ansible: 7 | timeout = 30 8 | 9 | [callback_profile_tasks] 10 | task_output_limit = 10000 11 | sort_order = none 12 | -------------------------------------------------------------------------------- /ansible/inventory: -------------------------------------------------------------------------------- 1 | [orchestration] 2 | -------------------------------------------------------------------------------- /ansible/kraken.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: orchestration 3 | gather_facts: true 4 | remote_user: "{{ orchestration_user }}" 5 | vars_files: 6 | - vars/kraken_vars.yml 7 | 8 | tasks: 9 | - name: Git clone kraken repository 10 | git: 11 | repo: "{{ kraken_repository }}" 12 | dest: "{{ kraken_dir }}" 13 | force: yes 14 | 15 | - name: Generate kraken config file 16 | template: 17 | src: kraken.j2 18 | dest: "{{ kraken_config }}" 19 | 20 | - name: Start injecting failures 21 | shell: | 22 | cd "{{ kraken_dir }}" 23 | cp -r "{{ scenarios_folder_path }}"* scenarios/ 24 | unset CONFIG 25 | python3 run_kraken.py 26 | ignore_errors: yes 27 | -------------------------------------------------------------------------------- /ansible/templates/kraken.j2: -------------------------------------------------------------------------------- 1 | kraken: 2 | kubeconfig_path: {{ kubeconfig_path }} # Path to kubeconfig 3 | exit_on_failure: {{ exit_on_failure }} # Exit when a post action scenario fails 4 | scenarios: {{ scenarios }} # List of policies/chaos scenarios to load 5 | 6 | cerberus: 7 | cerberus_enabled: {{ cerberus_enabled }} # Enable it when cerberus is previously installed 8 | cerberus_url: {{ cerberus_url }} # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal 9 | 10 | tunings: 11 | wait_duration: {{ wait_duration }} # Duration to wait between each chaos scenario 12 | iterations: {{ iterations }} # Number of times to execute the scenarios 13 | daemon_mode: {{ daemon_mode }} # Iterations are set to infinity which means that the cerberus will monitor the resources forever 14 | -------------------------------------------------------------------------------- /ansible/vars/kraken_vars.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Ansible SSH variables.
3 | ############################################################################### 4 | ansible_public_key_file: "{{ lookup('env', 'PUBLIC_KEY')|default('~/.ssh/id_rsa.pub', true) }}" 5 | ansible_private_key_file: "{{ lookup('env', 'PRIVATE_KEY')|default('~/.ssh/id_rsa', true) }}" 6 | 7 | orchestration_user: "{{ lookup('env', 'ORCHESTRATION_USER')|default('root', true) }}" 8 | ############################################################################### 9 | 10 | # kube config location 11 | kubeconfig_path: "{{ lookup('env', 'KUBECONFIG_PATH')|default('~/.kube/config', true) }}" 12 | 13 | # kraken dir location on jump host 14 | kraken_dir: "{{ lookup('env', 'KRAKEN_DIR')|default('~/kraken', true) }}" 15 | 16 | # kraken config path location 17 | kraken_config: "{{ lookup('env', 'KRAKEN_CONFIG')|default('~/kraken/config/config.yaml', true) }}" 18 | 19 | # kraken repository location 20 | kraken_repository: "{{ lookup('env', 'KRAKEN_REPOSITORY')|default('https://github.com/openshift-scale/kraken.git', true) }}" 21 | 22 | # scenarios to inject 23 | scenarios_folder_path: "{{ lookup('env', 'SCENARIOS_FOLDER_PATH')|default('CI/scenarios/', true) }}" 24 | scenarios: "{{ lookup('env', 'SCENARIOS')|default('[[scenarios/etcd.yml, scenarios/post_action_etcd_example.sh], [scenarios/openshift-apiserver.yml, scenarios/post_action_openshift-kube-apiserver.yml], [scenarios/openshift-kube-apiserver.yml, scenarios/post_action_openshift-apiserver.yml], [scenarios/regex_openshift_pod_kill.yml, scenarios/post_action_regex.py]]', true) }}" 25 | 26 | exit_on_failure: "{{ lookup('env', 'EXIT_ON_FAILURE')|default(false, true) }}" 27 | 28 | # Cerberus enabled by user 29 | cerberus_enabled: "{{ lookup('env', 'CERBERUS_ENABLED')|default(false, true) }}" 30 | cerberus_url: "{{ lookup('env', 'CERBERUS_URL')|default('', true) }}" 31 | 32 | # Kraken configurations 33 | wait_duration: "{{ lookup('env', 'WAIT_DURATION')|default(60, true) }}" 34 | iterations: "{{ lookup('env', 'ITERATIONS')|default(1, true) }}" 35 | daemon_mode: "{{ lookup('env', 'DAEMON_MODE')|default(false, true) }}" 36 | -------------------------------------------------------------------------------- /config/cerberus.yaml: -------------------------------------------------------------------------------- 1 | cerberus: 2 | distribution: openshift # Distribution can be kubernetes or openshift 3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig 4 | port: 8080 # http server port where cerberus status is published 5 | watch_nodes: True # Set to True for the cerberus to monitor the cluster nodes 6 | watch_cluster_operators: True # Set to True for cerberus to monitor cluster operators 7 | watch_url_routes: # Route url's you want to monitor, this is a double array with the url and optional authorization parameter 8 | watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label. 
9 | enabled: True 10 | label: node-role.kubernetes.io/master 11 | watch_namespaces: # List of namespaces to be monitored 12 | - openshift-etcd 13 | - openshift-apiserver 14 | - openshift-kube-apiserver 15 | - openshift-monitoring 16 | - openshift-kube-controller-manager 17 | - openshift-machine-api 18 | - openshift-kube-scheduler 19 | - openshift-ingress 20 | - openshift-sdn # When enabled, it will check for the cluster sdn and monitor that namespace 21 | cerberus_publish_status: True # When enabled, cerberus starts a light weight http server and publishes the status 22 | inspect_components: False # Enable it only when OpenShift client is supported to run 23 | # When enabled, cerberus collects logs, events and metrics of failed components 24 | 25 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. 26 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 27 | # This enables Cerberus to query prometheus and alert on observing high Kube API Server latencies. 28 | 29 | slack_integration: False # When enabled, cerberus reports the failed iterations in the slack channel 30 | # The following env vars needs to be set: SLACK_API_TOKEN ( Bot User OAuth Access Token ) and SLACK_CHANNEL ( channel to send notifications in case of failures ) 31 | # When slack_integration is enabled, a watcher can be assigned for each day. The watcher of the day is tagged while reporting failures in the slack channel. Values are slack member ID's. 32 | watcher_slack_ID: # (NOTE: Defining the watcher id's is optional and when the watcher slack id's are not defined, the slack_team_alias tag is used if it is set else no tag is used while reporting failures in the slack channel.) 33 | Monday: 34 | Tuesday: 35 | Wednesday: 36 | Thursday: 37 | Friday: 38 | Saturday: 39 | Sunday: 40 | slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned 41 | 42 | custom_checks: # Relative paths of files conataining additional user defined checks 43 | 44 | tunings: 45 | timeout: 3 # Number of seconds before requests fail 46 | iterations: 5 # Iterations to loop before stopping the watch, it will be replaced with infinity when the daemon mode is enabled 47 | sleep_time: 5 # Sleep duration between each iteration 48 | kube_api_request_chunk_size: 250 # Large requests will be broken into the specified chunk size to reduce the load on API server and improve responsiveness. 
49 | daemon_mode: True # Iterations are set to infinity which means that the cerberus will monitor the resources forever 50 | cores_usage_percentage: 0.5 # Set the fraction of cores to be used for multiprocessing 51 | 52 | database: 53 | database_path: /tmp/cerberus.db # Path where cerberus database needs to be stored 54 | reuse_database: False # When enabled, the database is reused to store the failures 55 | -------------------------------------------------------------------------------- /config/config_kind.yaml: -------------------------------------------------------------------------------- 1 | kraken: 2 | distribution: kubernetes # Distribution can be kubernetes or openshift 3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig 4 | exit_on_failure: False # Exit when a post action scenario fails 5 | port: 8081 6 | publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081 7 | signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details 8 | signal_address: 0.0.0.0 # Signal listening address 9 | chaos_scenarios: # List of policies/chaos scenarios to load 10 | - plugin_scenarios: 11 | - scenarios/kind/scheduler.yml 12 | - node_scenarios: 13 | - scenarios/kind/node_scenarios_example.yml 14 | 15 | cerberus: 16 | cerberus_enabled: False # Enable it when cerberus is previously installed 17 | cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal 18 | check_applicaton_routes: False # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run 19 | 20 | performance_monitoring: 21 | deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift 22 | repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" 23 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. 24 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
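    # Illustrative (hypothetical) values for a plain Kubernetes cluster where Prometheus is reachable in-cluster;
    # both fields are left empty above because they are only auto-discovered on OpenShift:
    # prometheus_url: http://prometheus.monitoring.svc.cluster.local:9090
    # prometheus_bearer_token: <token of a service account permitted to query Prometheus>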
25 | uuid: # uuid for the run is generated by default if not set 26 | enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error 27 | alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries 28 | 29 | tunings: 30 | wait_duration: 60 # Duration to wait between each chaos scenario 31 | iterations: 1 # Number of times to execute the scenarios 32 | daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever 33 | -------------------------------------------------------------------------------- /config/config_kubernetes.yaml: -------------------------------------------------------------------------------- 1 | kraken: 2 | distribution: kubernetes # Distribution can be kubernetes or openshift 3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig 4 | exit_on_failure: False # Exit when a post action scenario fails 5 | port: 8081 6 | publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081 7 | signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details 8 | chaos_scenarios: # List of policies/chaos scenarios to load 9 | - container_scenarios: # List of chaos pod scenarios to load 10 | - scenarios/kube/container_dns.yml 11 | - plugin_scenarios: 12 | - scenarios/kube/scheduler.yml 13 | 14 | cerberus: 15 | cerberus_enabled: False # Enable it when cerberus is previously installed 16 | cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal 17 | check_applicaton_routes: False # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run 18 | 19 | performance_monitoring: 20 | deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift 21 | repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" 22 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. 23 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
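    # One possible way to obtain a bearer token on Kubernetes >= 1.24 (the service account name and
    # namespace below are hypothetical; adjust them to your monitoring stack):
    #   kubectl create token prometheus-k8s -n monitoring
    # and paste the output into prometheus_bearer_token above.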
24 | uuid: # uuid for the run is generated by default if not set 25 | enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error 26 | alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries 27 | check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos after soak time for the cluster to settle down 28 | tunings: 29 | wait_duration: 60 # Duration to wait between each chaos scenario 30 | iterations: 1 # Number of times to execute the scenarios 31 | daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever 32 | -------------------------------------------------------------------------------- /config/metrics.yaml: -------------------------------------------------------------------------------- 1 | metrics: 2 | # API server 3 | - query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0 4 | metricName: schedulingThroughput 5 | 6 | # Containers & pod metrics 7 | - query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node) 8 | metricName: podCPU 9 | 10 | - query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node) 11 | metricName: podMemory 12 | 13 | - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0 14 | metricName: containerDiskUsage 15 | 16 | # Kubelet & CRI-O metrics 17 | - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} 18 | metricName: kubeletCPU 19 | 20 | - query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"} 21 | metricName: kubeletMemory 22 | 23 | - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} 24 | metricName: crioCPU 25 | 26 | - query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"} 27 | metricName: crioMemory 28 | 29 | # Node metrics 30 | - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0 31 | metricName: nodeCPU-Masters 32 | 33 | - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) 34 | metricName: nodeMemory-Masters 35 | 36 | - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0 37 | metricName: nodeCPU-Workers 38 | 39 | - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[2m:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) 40 | metricName: nodeMemory-Workers 41 | 42 | - 
query: avg(node_memory_MemAvailable_bytes) by (instance) 43 | metricName: nodeMemoryAvailable 44 | 45 | - query: avg(node_memory_Active_bytes) by (instance) 46 | metricName: nodeMemoryActive 47 | 48 | - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) 49 | metricName: maxMemory-Masters 50 | 51 | - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) 52 | metricName: nodeMemoryCached+nodeMemoryBuffers 53 | 54 | - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) 55 | metricName: rxNetworkBytes 56 | 57 | - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) 58 | metricName: txNetworkBytes 59 | 60 | - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) 61 | metricName: nodeDiskWrittenBytes 62 | 63 | - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) 64 | metricName: nodeDiskReadBytes 65 | 66 | - query: sum(rate(etcd_server_leader_changes_seen_total[2m])) 67 | metricName: etcdLeaderChangesRate 68 | 69 | # Etcd metrics 70 | - query: etcd_server_is_leader > 0 71 | metricName: etcdServerIsLeader 72 | 73 | - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m])) 74 | metricName: 99thEtcdDiskBackendCommitDurationSeconds 75 | 76 | - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m])) 77 | metricName: 99thEtcdDiskWalFsyncDurationSeconds 78 | 79 | - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) 80 | metricName: 99thEtcdRoundTripTimeSeconds 81 | 82 | - query: etcd_mvcc_db_total_size_in_bytes 83 | metricName: etcdDBPhysicalSizeBytes 84 | 85 | - query: etcd_mvcc_db_total_size_in_use_in_bytes 86 | metricName: etcdDBLogicalSizeBytes 87 | 88 | - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0 89 | metricName: etcdObjectCount 90 | 91 | - query: sum by (cluster_version)(etcd_cluster_version) 92 | metricName: etcdVersion 93 | instant: true -------------------------------------------------------------------------------- /config/recommender_config.yaml: -------------------------------------------------------------------------------- 1 | application: openshift-etcd 2 | namespaces: openshift-etcd 3 | labels: app=openshift-etcd 4 | kubeconfig: ~/.kube/config.yaml 5 | prometheus_endpoint: 6 | auth_token: 7 | scrape_duration: 10m 8 | chaos_library: "kraken" 9 | log_level: INFO 10 | json_output_file: False 11 | json_output_folder_path: 12 | 13 | # for output purpose only do not change if not needed 14 | chaos_tests: 15 | GENERIC: 16 | - pod_failure 17 | - container_failure 18 | - node_failure 19 | - zone_outage 20 | - time_skew 21 | - namespace_failure 22 | - power_outage 23 | CPU: 24 | - node_cpu_hog 25 | NETWORK: 26 | - application_outage 27 | - node_network_chaos 28 | - pod_network_chaos 29 | MEM: 30 | - node_memory_hog 31 | - pvc_disk_fill 32 | 33 | threshold: .7 34 | cpu_threshold: .5 35 | mem_threshold: .5 36 | -------------------------------------------------------------------------------- /containers/Dockerfile.template: -------------------------------------------------------------------------------- 1 | # oc build 2 | FROM golang:1.23.1 AS oc-build 3 | RUN apt-get update && apt-get install -y --no-install-recommends libkrb5-dev 4 | WORKDIR /tmp 5 | RUN git clone --branch release-4.18 
https://github.com/openshift/oc.git 6 | WORKDIR /tmp/oc 7 | RUN go mod edit -go 1.23.1 &&\ 8 | go get github.com/moby/buildkit@v0.12.5 &&\ 9 | go get github.com/containerd/containerd@v1.7.11&&\ 10 | go get github.com/docker/docker@v25.0.6&&\ 11 | go get github.com/opencontainers/runc@v1.1.14&&\ 12 | go get github.com/go-git/go-git/v5@v5.13.0&&\ 13 | go get golang.org/x/net@v0.36.0&&\ 14 | go get github.com/containerd/containerd@v1.7.27&&\ 15 | go get golang.org/x/oauth2@v0.27.0&&\ 16 | go get golang.org/x/crypto@v0.35.0&&\ 17 | go mod tidy && go mod vendor 18 | RUN make GO_REQUIRED_MIN_VERSION:= oc 19 | 20 | FROM fedora:40 21 | ARG PR_NUMBER 22 | ARG TAG 23 | RUN groupadd -g 1001 krkn && useradd -m -u 1001 -g krkn krkn 24 | RUN dnf update -y 25 | 26 | ENV KUBECONFIG /home/krkn/.kube/config 27 | 28 | 29 | # This overwrites any existing configuration in /etc/yum.repos.d/kubernetes.repo 30 | RUN dnf update && dnf install -y --setopt=install_weak_deps=False \ 31 | git python39 jq yq gettext wget which &&\ 32 | dnf clean all 33 | 34 | # copy oc client binary from oc-build image 35 | COPY --from=oc-build /tmp/oc/oc /usr/bin/oc 36 | 37 | # krkn build 38 | RUN git clone https://github.com/krkn-chaos/krkn.git /home/krkn/kraken && \ 39 | mkdir -p /home/krkn/.kube 40 | 41 | WORKDIR /home/krkn/kraken 42 | 43 | # default behaviour will be to build main 44 | # if it is a PR trigger the PR itself will be checked out 45 | RUN if [ -n "$PR_NUMBER" ]; then git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} && git checkout pr-${PR_NUMBER};fi 46 | # if it is a TAG trigger checkout the tag 47 | RUN if [ -n "$TAG" ]; then git checkout "$TAG";fi 48 | 49 | RUN python3.9 -m ensurepip --upgrade --default-pip 50 | RUN python3.9 -m pip install --upgrade pip setuptools==70.0.0 51 | RUN pip3.9 install -r requirements.txt 52 | RUN pip3.9 install jsonschema 53 | 54 | LABEL krknctl.title.global="Krkn Base Image" 55 | LABEL krknctl.description.global="This is the krkn base image." 56 | LABEL krknctl.input_fields.global='$KRKNCTL_INPUT' 57 | 58 | 59 | RUN chown -R krkn:krkn /home/krkn && chmod 755 /home/krkn 60 | USER krkn 61 | ENTRYPOINT ["python3.9", "run_kraken.py"] 62 | CMD ["--config=config/config.yaml"] 63 | -------------------------------------------------------------------------------- /containers/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Kraken image 3 | 4 | Container image gets automatically built by quay.io at [Kraken image](https://quay.io/redhat-chaos/krkn). 5 | 6 | 7 | ### Run containerized version 8 | 9 | Refer [instructions](https://krkn-chaos.dev/docs/installation/) for information on how to run the containerized version of kraken. 10 | 11 | 12 | ### Run Custom Kraken Image 13 | 14 | Refer to [instructions](https://github.com/redhat-chaos/krkn/blob/main/containers/build_own_image-README.md) for information on how to run a custom containerized version of kraken using podman. 15 | -------------------------------------------------------------------------------- /containers/build_own_image-README.md: -------------------------------------------------------------------------------- 1 | # Building your own Kraken image 2 | 3 | 1. Git clone the Kraken repository using `git clone https://github.com/redhat-chaos/krkn.git`. 4 | 2. Modify the python code and yaml files to address your needs. 5 | 3. Execute `podman build -t :latest .` in the containers directory within kraken to build an image from a Dockerfile. 6 | 4. 
Execute `podman run --detach --name :latest` to start a container based on your new image. 7 | 8 | # Building the Kraken image on IBM Power (ppc64le) 9 | 10 | 1. Git clone the Kraken repository using `git clone https://github.com/redhat-chaos/krkn.git` on an IBM Power Systems server. 11 | 2. Modify the python code and yaml files to address your needs. 12 | 3. Execute `podman build -t :latest -f Dockerfile-ppc64le` in the containers directory within kraken to build an image from the Dockerfile for Power. 13 | 4. Execute `podman run --detach --name :latest` to start a container based on your new image. 14 | -------------------------------------------------------------------------------- /containers/compile_dockerfile.sh: -------------------------------------------------------------------------------- 1 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 2 | cd "$SCRIPT_DIR" 3 | export KRKNCTL_INPUT=$(cat krknctl-input.json|tr -d "\n") 4 | 5 | envsubst '${KRKNCTL_INPUT}' < Dockerfile.template > Dockerfile -------------------------------------------------------------------------------- /kind-config.yml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | nodes: 4 | - role: control-plane 5 | extraPortMappings: 6 | - containerPort: 30036 7 | hostPort: 8888 8 | - role: control-plane 9 | - role: control-plane 10 | - role: worker 11 | - role: worker 12 | -------------------------------------------------------------------------------- /krkn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/__init__.py -------------------------------------------------------------------------------- /krkn/cerberus/__init__.py: -------------------------------------------------------------------------------- 1 | from .setup import * 2 | -------------------------------------------------------------------------------- /krkn/chaos_recommender/__init__.py: -------------------------------------------------------------------------------- 1 | from .analysis import * 2 | from .kraken_tests import * 3 | from .prometheus import * -------------------------------------------------------------------------------- /krkn/chaos_recommender/kraken_tests.py: -------------------------------------------------------------------------------- 1 | def get_entries_by_category(filename, category): 2 | # Read the file 3 | with open(filename, "r") as file: 4 | content = file.read() 5 | 6 | # Split the content into sections based on the square brackets 7 | sections = content.split("\n\n") 8 | 9 | # Define the categories 10 | valid_categories = ["CPU", "NETWORK", "MEM", "GENERIC"] 11 | 12 | # Validate the provided category 13 | if category not in valid_categories: 14 | return [] 15 | 16 | # Find the section corresponding to the specified category 17 | target_section = None 18 | for section in sections: 19 | if section.startswith(f"[{category}]"): 20 | target_section = section 21 | break 22 | 23 | # If the category section was not found, return an empty list 24 | if target_section is None: 25 | return [] 26 | 27 | # Extract the entries from the category section 28 | entries = [ 29 | entry.strip() 30 | for entry in target_section.split("\n") 31 | if entry and not entry.startswith("[") 32 | ] 33 | 34 | return entries 35 | -------------------------------------------------------------------------------- 
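A minimal usage sketch for get_entries_by_category above; the temporary file path and section contents are made up for illustration, while the real category list ships with the chaos recommender:

from krkn.chaos_recommender.kraken_tests import get_entries_by_category

# Write a tiny, hypothetical category file in the bracketed-section format the
# parser expects (sections are separated by a blank line).
with open("/tmp/example_kraken_tests.txt", "w") as f:
    f.write("[GENERIC]\npod_failure\nnode_failure\n\n[CPU]\nnode_cpu_hog\n")

print(get_entries_by_category("/tmp/example_kraken_tests.txt", "CPU"))   # ['node_cpu_hog']
print(get_entries_by_category("/tmp/example_kraken_tests.txt", "DISK"))  # [] -- not a valid category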
/krkn/invoke/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/invoke/__init__.py -------------------------------------------------------------------------------- /krkn/invoke/command.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import logging 3 | import sys 4 | 5 | 6 | # Invokes a given command and returns the stdout 7 | def invoke(command, timeout=None): 8 | output = "" 9 | try: 10 | output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout) 11 | except Exception as e: 12 | logging.error("Failed to run %s, error: %s" % (command, e)) 13 | sys.exit(1) 14 | return output 15 | 16 | 17 | # Invokes a given command and returns the stdout 18 | def invoke_no_exit(command, timeout=None): 19 | output = "" 20 | try: 21 | output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout) 22 | logging.info("output " + str(output)) 23 | except Exception as e: 24 | logging.error("Failed to run %s, error: %s" % (command, e)) 25 | return str(e) 26 | return output 27 | 28 | 29 | def run(command): 30 | try: 31 | subprocess.run(command, shell=True, universal_newlines=True, timeout=45) 32 | except Exception: 33 | pass 34 | -------------------------------------------------------------------------------- /krkn/performance_dashboards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/performance_dashboards/__init__.py -------------------------------------------------------------------------------- /krkn/performance_dashboards/setup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import logging 3 | import git 4 | import sys 5 | 6 | 7 | # Installs a mutable grafana on the Kubernetes/OpenShift cluster and loads the performance dashboards 8 | def setup(repo, distribution): 9 | if distribution == "kubernetes": 10 | command = "cd performance-dashboards/dittybopper && ./k8s-deploy.sh" 11 | elif distribution == "openshift": 12 | command = "cd performance-dashboards/dittybopper && ./deploy.sh" 13 | else: 14 | logging.error("Provided distribution: %s is not supported" % (distribution)) 15 | sys.exit(1) 16 | delete_repo = "rm -rf performance-dashboards || exit 0" 17 | logging.info( 18 | "Cloning, installing mutable grafana on the cluster and loading the dashboards" 19 | ) 20 | try: 21 | # delete repo to clone the latest copy if exists 22 | subprocess.run(delete_repo, shell=True, universal_newlines=True, timeout=45) 23 | # clone the repo 24 | git.Repo.clone_from(repo, "performance-dashboards") 25 | # deploy performance dashboards 26 | subprocess.run(command, shell=True, universal_newlines=True) 27 | except Exception as e: 28 | logging.error("Failed to install performance-dashboards, error: %s" % (e)) 29 | -------------------------------------------------------------------------------- /krkn/prometheus/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import * -------------------------------------------------------------------------------- /krkn/scenario_plugins/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/application_outage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/application_outage/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/application_outage/application_outage_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import yaml 4 | from krkn_lib.models.telemetry import ScenarioTelemetry 5 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 6 | from krkn_lib.utils import get_yaml_item_value, get_random_string 7 | from jinja2 import Template 8 | from krkn import cerberus 9 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 10 | 11 | 12 | class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin): 13 | def run( 14 | self, 15 | run_uuid: str, 16 | scenario: str, 17 | krkn_config: dict[str, any], 18 | lib_telemetry: KrknTelemetryOpenshift, 19 | scenario_telemetry: ScenarioTelemetry, 20 | ) -> int: 21 | wait_duration = krkn_config["tunings"]["wait_duration"] 22 | try: 23 | with open(scenario, "r") as f: 24 | app_outage_config_yaml = yaml.full_load(f) 25 | scenario_config = app_outage_config_yaml["application_outage"] 26 | pod_selector = get_yaml_item_value( 27 | scenario_config, "pod_selector", "{}" 28 | ) 29 | traffic_type = get_yaml_item_value( 30 | scenario_config, "block", "[Ingress, Egress]" 31 | ) 32 | namespace = get_yaml_item_value(scenario_config, "namespace", "") 33 | duration = get_yaml_item_value(scenario_config, "duration", 60) 34 | 35 | start_time = int(time.time()) 36 | policy_name = f"krkn-deny-{get_random_string(5)}" 37 | 38 | network_policy_template = ( 39 | """--- 40 | apiVersion: networking.k8s.io/v1 41 | kind: NetworkPolicy 42 | metadata: 43 | name: """ 44 | + policy_name 45 | + """ 46 | spec: 47 | podSelector: 48 | matchLabels: {{ pod_selector }} 49 | policyTypes: {{ traffic_type }} 50 | """ 51 | ) 52 | t = Template(network_policy_template) 53 | rendered_spec = t.render( 54 | pod_selector=pod_selector, traffic_type=traffic_type 55 | ) 56 | yaml_spec = yaml.safe_load(rendered_spec) 57 | # Block the traffic by creating network policy 58 | logging.info("Creating the network policy") 59 | 60 | lib_telemetry.get_lib_kubernetes().create_net_policy( 61 | yaml_spec, namespace 62 | ) 63 | 64 | # wait for the specified duration 65 | logging.info( 66 | "Waiting for the specified duration in the config: %s" % duration 67 | ) 68 | time.sleep(duration) 69 | 70 | # unblock the traffic by deleting the network policy 71 | logging.info("Deleting the network policy") 72 | lib_telemetry.get_lib_kubernetes().delete_net_policy( 73 | policy_name, namespace 74 | ) 75 | 76 | logging.info( 77 | "End of scenario. 
Waiting for the specified duration: %s" 78 | % wait_duration 79 | ) 80 | time.sleep(wait_duration) 81 | 82 | end_time = int(time.time()) 83 | cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) 84 | except Exception as e: 85 | logging.error( 86 | "ApplicationOutageScenarioPlugin exiting due to Exception %s" % e 87 | ) 88 | return 1 89 | else: 90 | return 0 91 | 92 | def get_scenario_types(self) -> list[str]: 93 | return ["application_outages_scenarios"] 94 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/container/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/container/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/hogs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/hogs/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/managed_cluster/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/managed_cluster/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/managed_cluster/common_functions.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | from krkn_lib.k8s import KrknKubernetes 4 | 5 | 6 | # krkn_lib 7 | # Pick a random managedcluster with specified label selector 8 | def get_managedcluster( 9 | managedcluster_name, label_selector, instance_kill_count, kubecli: KrknKubernetes 10 | ): 11 | 12 | if managedcluster_name in kubecli.list_killable_managedclusters(): 13 | return [managedcluster_name] 14 | elif managedcluster_name: 15 | logging.info( 16 | "managedcluster with provided managedcluster_name does not exist or the managedcluster might " 17 | "be in unavailable state." 
18 | ) 19 | managedclusters = kubecli.list_killable_managedclusters(label_selector) 20 | if not managedclusters: 21 | raise Exception( 22 | "Available managedclusters with the provided label selector do not exist" 23 | ) 24 | logging.info( 25 | "Available managedclusters with the label selector %s: %s" 26 | % (label_selector, managedclusters) 27 | ) 28 | number_of_managedclusters = len(managedclusters) 29 | if instance_kill_count == number_of_managedclusters: 30 | return managedclusters 31 | managedclusters_to_return = [] 32 | for i in range(instance_kill_count): 33 | managedcluster_to_add = managedclusters[ 34 | random.randint(0, len(managedclusters) - 1) 35 | ] 36 | managedclusters_to_return.append(managedcluster_to_add) 37 | managedclusters.remove(managedcluster_to_add) 38 | return managedclusters_to_return 39 | 40 | 41 | # Wait until the managedcluster status becomes Available 42 | # krkn_lib 43 | def wait_for_available_status(managedcluster, timeout, kubecli: KrknKubernetes): 44 | kubecli.watch_managedcluster_status(managedcluster, "True", timeout) 45 | 46 | 47 | # Wait until the managedcluster status becomes Not Available 48 | # krkn_lib 49 | def wait_for_unavailable_status(managedcluster, timeout, kubecli: KrknKubernetes): 50 | kubecli.watch_managedcluster_status(managedcluster, "Unknown", timeout) 51 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/native/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/native_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 2 | from krkn.scenario_plugins.native.plugins import PLUGINS 3 | from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool 4 | from krkn_lib.models.telemetry import ScenarioTelemetry 5 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 6 | from typing import Any 7 | import logging 8 | 9 | 10 | class NativeScenarioPlugin(AbstractScenarioPlugin): 11 | 12 | def run( 13 | self, 14 | run_uuid: str, 15 | scenario: str, 16 | krkn_config: dict[str, any], 17 | lib_telemetry: KrknTelemetryOpenshift, 18 | scenario_telemetry: ScenarioTelemetry, 19 | ) -> int: 20 | pool = PodsMonitorPool(lib_telemetry.get_lib_kubernetes()) 21 | kill_scenarios = [ 22 | kill_scenario 23 | for kill_scenario in PLUGINS.unserialize_scenario(scenario) 24 | if kill_scenario["id"] == "kill-pods" 25 | ] 26 | 27 | try: 28 | self.start_monitoring(pool, kill_scenarios) 29 | PLUGINS.run( 30 | scenario, 31 | lib_telemetry.get_lib_kubernetes().get_kubeconfig_path(), 32 | krkn_config, 33 | run_uuid, 34 | ) 35 | result = pool.join() 36 | scenario_telemetry.affected_pods = result 37 | if result.error: 38 | logging.error(f"NativeScenarioPlugin unrecovered pods: {result.error}") 39 | return 1 40 | 41 | except Exception as e: 42 | logging.error("NativeScenarioPlugin exiting due to Exception %s" % e) 43 | pool.cancel() 44 | return 1 45 | else: 46 | return 0 47 | 48 | def get_scenario_types(self) -> list[str]: 49 | return [ 50 | "pod_disruption_scenarios", 51 | "pod_network_scenarios", 52 | "ingress_node_scenarios" 53 | ] 54 | 55 | def start_monitoring(self, pool: PodsMonitorPool, scenarios: 
list[Any]): 56 | for kill_scenario in scenarios: 57 | recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"] 58 | if ( 59 | "namespace_pattern" in kill_scenario["config"] 60 | and "label_selector" in kill_scenario["config"] 61 | ): 62 | namespace_pattern = kill_scenario["config"]["namespace_pattern"] 63 | label_selector = kill_scenario["config"]["label_selector"] 64 | pool.select_and_monitor_by_namespace_pattern_and_label( 65 | namespace_pattern=namespace_pattern, 66 | label_selector=label_selector, 67 | max_timeout=recovery_time, 68 | ) 69 | logging.info( 70 | f"waiting {recovery_time} seconds for pod recovery, " 71 | f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}" 72 | ) 73 | 74 | elif ( 75 | "namespace_pattern" in kill_scenario["config"] 76 | and "name_pattern" in kill_scenario["config"] 77 | ): 78 | namespace_pattern = kill_scenario["config"]["namespace_pattern"] 79 | name_pattern = kill_scenario["config"]["name_pattern"] 80 | pool.select_and_monitor_by_name_pattern_and_namespace_pattern( 81 | pod_name_pattern=name_pattern, 82 | namespace_pattern=namespace_pattern, 83 | max_timeout=recovery_time, 84 | ) 85 | logging.info( 86 | f"waiting {recovery_time} seconds for pod recovery, " 87 | f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}" 88 | ) 89 | else: 90 | raise Exception( 91 | f"impossible to determine monitor parameters, check {kill_scenario} configuration" 92 | ) 93 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/network/job.j2: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: chaos-{{jobname}} 5 | spec: 6 | template: 7 | spec: 8 | nodeName: {{nodename}} 9 | hostNetwork: true 10 | containers: 11 | - name: networkchaos 12 | image: docker.io/fedora/tools 13 | command: ["/bin/sh", "-c", "{{cmd}}"] 14 | securityContext: 15 | privileged: true 16 | volumeMounts: 17 | - mountPath: /lib/modules 18 | name: lib-modules 19 | readOnly: true 20 | volumes: 21 | - name: lib-modules 22 | hostPath: 23 | path: /lib/modules 24 | restartPolicy: Never 25 | backoffLimit: 0 -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/network/pod_interface.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: fedtools 5 | spec: 6 | hostNetwork: true 7 | nodeName: {{nodename}} 8 | containers: 9 | - name: fedtools 10 | image: docker.io/fedora/tools 11 | command: 12 | - /bin/sh 13 | - -c 14 | - "trap : TERM INT; sleep infinity & wait" 15 | securityContext: 16 | privileged: true 17 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/network/pod_module.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: modtools 5 | spec: 6 | nodeName: {{nodename}} 7 | containers: 8 | - name: modtools 9 | image: docker.io/fedora/tools 10 | imagePullPolicy: IfNotPresent 11 | command: 12 | - /bin/sh 13 | - -c 14 | - "trap : TERM INT; sleep infinity & wait" 15 | tty: true 16 | stdin: true 17 | stdinOnce: true 18 | securityContext: 19 | privileged: true 20 | volumeMounts: 21 | - name: host 22 | mountPath: /host 23 | volumes: 24 | - name: host 25 | hostPath: 26 | path: / 27 | hostNetwork: true 28 | hostIPC: true 29 | hostPID: 
true 30 | restartPolicy: Never -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/pod_network_outage/job.j2: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: chaos-{{jobname}} 5 | spec: 6 | template: 7 | spec: 8 | nodeName: {{nodename}} 9 | hostNetwork: true 10 | containers: 11 | - name: networkchaos 12 | image: docker.io/fedora/tools 13 | command: ["chroot", "/host", "/bin/sh", "-c", "{{cmd}}"] 14 | securityContext: 15 | privileged: true 16 | volumeMounts: 17 | - name: host 18 | mountPath: /host 19 | volumes: 20 | - name: host 21 | hostPath: 22 | path: / 23 | 24 | restartPolicy: Never 25 | backoffLimit: 0 26 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/pod_network_outage/pod_module.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: modtools 5 | spec: 6 | nodeName: {{nodename}} 7 | containers: 8 | - name: modtools 9 | image: docker.io/fedora/tools 10 | imagePullPolicy: IfNotPresent 11 | command: 12 | - /bin/sh 13 | - -c 14 | - "trap : TERM INT; sleep infinity & wait" 15 | tty: true 16 | stdin: true 17 | stdinOnce: true 18 | securityContext: 19 | privileged: true 20 | volumeMounts: 21 | - name: host 22 | mountPath: /host 23 | volumes: 24 | - name: host 25 | hostPath: 26 | path: / 27 | hostNetwork: true 28 | hostIPC: true 29 | hostPID: true 30 | restartPolicy: Never -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/run_python_plugin.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import subprocess 3 | import sys 4 | import typing 5 | 6 | from arcaflow_plugin_sdk import plugin 7 | 8 | 9 | @dataclasses.dataclass 10 | class RunPythonFileInput: 11 | filename: str 12 | 13 | 14 | @dataclasses.dataclass 15 | class RunPythonFileOutput: 16 | stdout: str 17 | stderr: str 18 | 19 | 20 | @dataclasses.dataclass 21 | class RunPythonFileError: 22 | exit_code: int 23 | stdout: str 24 | stderr: str 25 | 26 | 27 | @plugin.step( 28 | id="run_python", 29 | name="Run a Python script", 30 | description="Run a specified Python script", 31 | outputs={"success": RunPythonFileOutput, "error": RunPythonFileError} 32 | ) 33 | def run_python_file(params: RunPythonFileInput) -> typing.Tuple[ 34 | str, 35 | typing.Union[RunPythonFileOutput, RunPythonFileError] 36 | ]: 37 | run_results = subprocess.run( 38 | [sys.executable, params.filename], 39 | capture_output=True 40 | ) 41 | if run_results.returncode == 0: 42 | return "success", RunPythonFileOutput( 43 | str(run_results.stdout, 'utf-8'), 44 | str(run_results.stderr, 'utf-8') 45 | ) 46 | return "error", RunPythonFileError( 47 | run_results.returncode, 48 | str(run_results.stdout, 'utf-8'), 49 | str(run_results.stderr, 'utf-8') 50 | ) 51 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/network_chaos/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos/job.j2: 
-------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: chaos-{{jobname}} 5 | spec: 6 | template: 7 | spec: 8 | nodeName: {{nodename}} 9 | hostNetwork: true 10 | containers: 11 | - name: networkchaos 12 | image: docker.io/fedora/tools 13 | command: ["/bin/sh", "-c", "{{cmd}}"] 14 | securityContext: 15 | privileged: true 16 | volumeMounts: 17 | - mountPath: /lib/modules 18 | name: lib-modules 19 | readOnly: true 20 | volumes: 21 | - name: lib-modules 22 | hostPath: 23 | path: /lib/modules 24 | restartPolicy: Never 25 | backoffLimit: 0 26 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos/pod.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: fedtools 5 | spec: 6 | hostNetwork: true 7 | nodeName: {{nodename}} 8 | containers: 9 | - name: fedtools 10 | image: docker.io/fedora/tools 11 | command: 12 | - /bin/sh 13 | - -c 14 | - | 15 | sleep infinity 16 | securityContext: 17 | privileged: true 18 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/network_chaos_ng/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | 4 | 5 | class NetworkChaosScenarioType(Enum): 6 | Node = 1 7 | Pod = 2 8 | 9 | @dataclass 10 | class BaseNetworkChaosConfig: 11 | supported_execution = ["serial", "parallel"] 12 | id: str 13 | wait_duration: int 14 | test_duration: int 15 | label_selector: str 16 | instance_count: int 17 | execution: str 18 | namespace: str 19 | 20 | def validate(self) -> list[str]: 21 | errors = [] 22 | if self.execution is None: 23 | errors.append(f"execution cannot be None, supported values are: {','.join(self.supported_execution)}") 24 | if self.execution not in self.supported_execution: 25 | errors.append(f"{self.execution} is not in supported execution mod: {','.join(self.supported_execution)}") 26 | if self.label_selector is None: 27 | errors.append("label_selector cannot be None") 28 | return errors 29 | 30 | @dataclass 31 | class NetworkFilterConfig(BaseNetworkChaosConfig): 32 | ingress: bool 33 | egress: bool 34 | interfaces: list[str] 35 | target: str 36 | ports: list[int] 37 | 38 | def validate(self) -> list[str]: 39 | errors = super().validate() 40 | # here further validations 41 | return errors 42 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/network_chaos_ng/modules/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/modules/abstract_network_chaos_module.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import 
logging 3 | import queue 4 | 5 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 6 | from krkn.scenario_plugins.network_chaos_ng.models import BaseNetworkChaosConfig, NetworkChaosScenarioType 7 | 8 | 9 | class AbstractNetworkChaosModule(abc.ABC): 10 | """ 11 | The abstract class that needs to be implemented by each Network Chaos Scenario 12 | """ 13 | @abc.abstractmethod 14 | def run(self, target: str, kubecli: KrknTelemetryOpenshift, error_queue: queue.Queue = None): 15 | """ 16 | the entrypoint method for the Network Chaos Scenario 17 | :param target: The resource name that will be targeted by the scenario (Node Name, Pod Name etc.) 18 | :param kubecli: The `KrknTelemetryOpenshift` needed by the scenario to access to the krkn-lib methods 19 | :param error_queue: A queue that will be used by the plugin to push the errors raised during the execution of parallel modules 20 | """ 21 | pass 22 | 23 | @abc.abstractmethod 24 | def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig): 25 | """ 26 | returns the common subset of settings shared by all the scenarios `BaseNetworkChaosConfig` and the type of Network 27 | Chaos Scenario that is running (Pod Scenario or Node Scenario) 28 | """ 29 | pass 30 | 31 | 32 | def log_info(self, message: str, parallel: bool = False, node_name: str = ""): 33 | """ 34 | log helper method for INFO severity to be used in the scenarios 35 | """ 36 | if parallel: 37 | logging.info(f"[{node_name}]: {message}") 38 | else: 39 | logging.info(message) 40 | 41 | def log_warning(self, message: str, parallel: bool = False, node_name: str = ""): 42 | """ 43 | log helper method for WARNING severity to be used in the scenarios 44 | """ 45 | if parallel: 46 | logging.warning(f"[{node_name}]: {message}") 47 | else: 48 | logging.warning(message) 49 | 50 | 51 | def log_error(self, message: str, parallel: bool = False, node_name: str = ""): 52 | """ 53 | log helper method for ERROR severity to be used in the scenarios 54 | """ 55 | if parallel: 56 | logging.error(f"[{node_name}]: {message}") 57 | else: 58 | logging.error(message) -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/modules/templates/network-chaos.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: {{pod_name}} 5 | namespace: {{namespace}} 6 | spec: 7 | {% if host_network %} 8 | hostNetwork: true 9 | {%endif%} 10 | nodeSelector: 11 | kubernetes.io/hostname: {{target}} 12 | containers: 13 | - name: fedora 14 | imagePullPolicy: Always 15 | image: quay.io/krkn-chaos/krkn-network-chaos:latest 16 | securityContext: 17 | privileged: true 18 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/network_chaos_factory.py: -------------------------------------------------------------------------------- 1 | from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig 2 | from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import AbstractNetworkChaosModule 3 | from krkn.scenario_plugins.network_chaos_ng.modules.node_network_filter import NodeNetworkFilterModule 4 | 5 | 6 | supported_modules = ["node_network_filter"] 7 | 8 | class NetworkChaosFactory: 9 | 10 | @staticmethod 11 | def get_instance(config: dict[str, str]) -> AbstractNetworkChaosModule: 12 | if config["id"] is None: 13 | raise Exception("network chaos id cannot be None") 
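        # A minimal sketch of the scenario entry this factory expects (values are
        # illustrative; the keys mirror the BaseNetworkChaosConfig/NetworkFilterConfig
        # dataclasses, and the example shipped with the repo is scenarios/kube/network-filter.yml):
        #   - id: node_network_filter
        #     wait_duration: 30
        #     test_duration: 60
        #     label_selector: "node-role.kubernetes.io/worker"
        #     instance_count: 1
        #     execution: parallel
        #     namespace: "default"
        #     ingress: false
        #     egress: true
        #     interfaces: []
        #     target: ""
        #     ports: [80, 443]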
14 | if config["id"] not in supported_modules: 15 | raise Exception(f"{config['id']} is not a supported network chaos module") 16 | 17 | if config["id"] == "node_network_filter": 18 | config = NetworkFilterConfig(**config) 19 | errors = config.validate() 20 | if len(errors) > 0: 21 | raise Exception(f"config validation errors: [{';'.join(errors)}]") 22 | return NodeNetworkFilterModule(config) 23 | 24 | 25 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/network_chaos_ng_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import queue 3 | import random 4 | import threading 5 | import time 6 | 7 | import yaml 8 | from krkn_lib.models.telemetry import ScenarioTelemetry 9 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 10 | 11 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 12 | from krkn.scenario_plugins.network_chaos_ng.models import ( 13 | NetworkChaosScenarioType, 14 | BaseNetworkChaosConfig, 15 | ) 16 | from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import ( 17 | AbstractNetworkChaosModule, 18 | ) 19 | from krkn.scenario_plugins.network_chaos_ng.network_chaos_factory import ( 20 | NetworkChaosFactory, 21 | ) 22 | 23 | 24 | class NetworkChaosNgScenarioPlugin(AbstractScenarioPlugin): 25 | def run( 26 | self, 27 | run_uuid: str, 28 | scenario: str, 29 | krkn_config: dict[str, any], 30 | lib_telemetry: KrknTelemetryOpenshift, 31 | scenario_telemetry: ScenarioTelemetry, 32 | ) -> int: 33 | try: 34 | with open(scenario, "r") as file: 35 | scenario_config = yaml.safe_load(file) 36 | if not isinstance(scenario_config, list): 37 | logging.error( 38 | "network chaos scenario config must be a list of objects" 39 | ) 40 | return 1 41 | for config in scenario_config: 42 | network_chaos = NetworkChaosFactory.get_instance(config) 43 | network_chaos_config = network_chaos.get_config() 44 | logging.info( 45 | f"running network_chaos scenario: {network_chaos_config[1].id}" 46 | ) 47 | if network_chaos_config[0] == NetworkChaosScenarioType.Node: 48 | targets = lib_telemetry.get_lib_kubernetes().list_nodes( 49 | network_chaos_config[1].label_selector 50 | ) 51 | else: 52 | targets = lib_telemetry.get_lib_kubernetes().list_pods( 53 | network_chaos_config[1].namespace, 54 | network_chaos_config[1].label_selector, 55 | ) 56 | if len(targets) == 0: 57 | logging.warning( 58 | f"no targets found for {network_chaos_config[1].id} " 59 | f"network chaos scenario with selector {network_chaos_config[1].label_selector} " 60 | f"with target type {network_chaos_config[0]}" 61 | ) 62 | 63 | if network_chaos_config[1].instance_count != 0 and network_chaos_config[1].instance_count > len(targets): 64 | targets = random.sample(targets, network_chaos_config[1].instance_count) 65 | 66 | if network_chaos_config[1].execution == "parallel": 67 | self.run_parallel(targets, network_chaos, lib_telemetry) 68 | else: 69 | self.run_serial(targets, network_chaos, lib_telemetry) 70 | if len(config) > 1: 71 | logging.info(f"waiting {network_chaos_config[1].wait_duration} seconds before running the next " 72 | f"Network Chaos NG Module") 73 | time.sleep(network_chaos_config[1].wait_duration) 74 | except Exception as e: 75 | logging.error(str(e)) 76 | return 1 77 | return 0 78 | 79 | def run_parallel( 80 | self, 81 | targets: list[str], 82 | module: AbstractNetworkChaosModule, 83 | lib_telemetry: 
KrknTelemetryOpenshift, 84 | ): 85 | error_queue = queue.Queue() 86 | threads = [] 87 | errors = [] 88 | for target in targets: 89 | thread = threading.Thread( 90 | target=module.run, args=[target, lib_telemetry, error_queue] 91 | ) 92 | thread.start() 93 | threads.append(thread) 94 | for thread in threads: 95 | thread.join() 96 | while True: 97 | try: 98 | errors.append(error_queue.get_nowait()) 99 | except queue.Empty: 100 | break 101 | if len(errors) > 0: 102 | raise Exception( 103 | f"module {module.get_config()[1].id} execution failed: [{';'.join(errors)}]" 104 | ) 105 | 106 | def run_serial( 107 | self, 108 | targets: list[str], 109 | module: AbstractNetworkChaosModule, 110 | lib_telemetry: KrknTelemetryOpenshift, 111 | ): 112 | for target in targets: 113 | module.run(target, lib_telemetry) 114 | 115 | def get_scenario_types(self) -> list[str]: 116 | return ["network_chaos_ng_scenarios"] 117 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/node_actions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/node_actions/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/node_actions/common_node_functions.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import random 4 | import logging 5 | import paramiko 6 | from krkn_lib.models.k8s import AffectedNode 7 | import krkn.invoke.command as runcommand 8 | from krkn_lib.k8s import KrknKubernetes 9 | from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus 10 | from krkn_lib.models.k8s import AffectedNode 11 | 12 | node_general = False 13 | 14 | 15 | def get_node_by_name(node_name_list, kubecli: KrknKubernetes): 16 | killable_nodes = kubecli.list_killable_nodes() 17 | for node_name in node_name_list: 18 | if node_name not in killable_nodes: 19 | logging.info( 20 | f"Node with provided ${node_name} does not exist or the node might " 21 | "be in NotReady state." 
22 | ) 23 | return 24 | return node_name_list 25 | 26 | 27 | # Pick a random node with specified label selector 28 | def get_node(label_selector, instance_kill_count, kubecli: KrknKubernetes): 29 | 30 | label_selector_list = label_selector.split(",") 31 | nodes = [] 32 | for label_selector in label_selector_list: 33 | nodes.extend(kubecli.list_killable_nodes(label_selector)) 34 | if not nodes: 35 | raise Exception("Ready nodes with the provided label selector do not exist") 36 | logging.info("Ready nodes with the label selector %s: %s" % (label_selector_list, nodes)) 37 | number_of_nodes = len(nodes) 38 | if instance_kill_count == number_of_nodes: 39 | return nodes 40 | nodes_to_return = [] 41 | for i in range(instance_kill_count): 42 | node_to_add = nodes[random.randint(0, len(nodes) - 1)] 43 | nodes_to_return.append(node_to_add) 44 | nodes.remove(node_to_add) 45 | return nodes_to_return 46 | 47 | # krkn_lib 48 | # Wait until the node status becomes Ready 49 | def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None): 50 | affected_node = kubecli.watch_node_status(node, "True", timeout, affected_node) 51 | return affected_node 52 | 53 | 54 | # krkn_lib 55 | # Wait until the node status becomes Not Ready 56 | def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None): 57 | affected_node = kubecli.watch_node_status(node, "False", timeout, affected_node) 58 | return affected_node 59 | 60 | 61 | # krkn_lib 62 | # Wait until the node status becomes Unknown 63 | def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None): 64 | affected_node = kubecli.watch_node_status(node, "Unknown", timeout, affected_node) 65 | return affected_node 66 | 67 | 68 | # Get the ip of the cluster node 69 | def get_node_ip(node): 70 | return runcommand.invoke( 71 | "kubectl get node %s -o " 72 | "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'" % (node) 73 | ) 74 | 75 | 76 | def check_service_status(node, service, ssh_private_key, timeout): 77 | ssh = paramiko.SSHClient() 78 | ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 79 | i = 0 80 | sleeper = 1 81 | while i <= timeout: 82 | try: 83 | time.sleep(sleeper) 84 | i += sleeper 85 | logging.info("Trying to ssh to instance: %s" % (node)) 86 | connection = ssh.connect( 87 | node, 88 | username="root", 89 | key_filename=ssh_private_key, 90 | timeout=800, 91 | banner_timeout=400, 92 | ) 93 | if connection is None: 94 | break 95 | except Exception as e: 96 | logging.error( 97 | "Failed to ssh to instance: %s within the timeout duration of %s: %s" 98 | % (node, timeout, e) 99 | ) 100 | 101 | for service_name in service: 102 | logging.info("Checking status of Service: %s" % (service_name)) 103 | stdin, stdout, stderr = ssh.exec_command( 104 | "systemctl status %s | grep '^ Active' " 105 | "| awk '{print $2}'" % (service_name) 106 | ) 107 | service_status = stdout.readlines()[0] 108 | logging.info( 109 | "Status of service %s is %s \n" % (service_name, service_status.strip()) 110 | ) 111 | if service_status.strip() != "active": 112 | logging.error( 113 | "Service %s is in %s state" % (service_name, service_status.strip()) 114 | ) 115 | ssh.close() 116 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from 
krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( 3 | abstract_node_scenarios, 4 | ) 5 | from krkn_lib.k8s import KrknKubernetes 6 | from krkn_lib.models.k8s import AffectedNodeStatus 7 | 8 | class GENERAL: 9 | def __init__(self): 10 | pass 11 | 12 | 13 | # krkn_lib 14 | class general_node_scenarios(abstract_node_scenarios): 15 | def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus): 16 | super().__init__(kubecli, affected_nodes_status) 17 | self.general = GENERAL() 18 | 19 | # Node scenario to start the node 20 | def node_start_scenario(self, instance_kill_count, node, timeout): 21 | logging.info( 22 | "Node start is not set up yet for this cloud type, " 23 | "no action is going to be taken" 24 | ) 25 | 26 | # Node scenario to stop the node 27 | def node_stop_scenario(self, instance_kill_count, node, timeout): 28 | logging.info( 29 | "Node stop is not set up yet for this cloud type," 30 | " no action is going to be taken" 31 | ) 32 | 33 | # Node scenario to terminate the node 34 | def node_termination_scenario(self, instance_kill_count, node, timeout): 35 | logging.info( 36 | "Node termination is not set up yet for this cloud type, " 37 | "no action is going to be taken" 38 | ) 39 | 40 | # Node scenario to reboot the node 41 | def node_reboot_scenario(self, instance_kill_count, node, timeout): 42 | logging.info( 43 | "Node reboot is not set up yet for this cloud type," 44 | " no action is going to be taken" 45 | ) 46 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/pvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/pvc/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/service_disruption/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/service_disruption/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/service_hijacking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/service_hijacking/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/service_hijacking/service_hijacking_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import yaml 5 | from krkn_lib.models.telemetry import ScenarioTelemetry 6 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 7 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 8 | 9 | 10 | class ServiceHijackingScenarioPlugin(AbstractScenarioPlugin): 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | with open(scenario) as stream: 20 | scenario_config = yaml.safe_load(stream) 21 | 22 | service_name = scenario_config["service_name"] 23 | service_namespace = scenario_config["service_namespace"] 24 | plan = 
scenario_config["plan"] 25 | image = scenario_config["image"] 26 | target_port = scenario_config["service_target_port"] 27 | chaos_duration = scenario_config["chaos_duration"] 28 | 29 | logging.info( 30 | f"checking service {service_name} in namespace: {service_namespace}" 31 | ) 32 | if not lib_telemetry.get_lib_kubernetes().service_exists( 33 | service_name, service_namespace 34 | ): 35 | logging.error( 36 | f"ServiceHijackingScenarioPlugin service: {service_name} not found in namespace: {service_namespace}, failed to run scenario." 37 | ) 38 | return 1 39 | try: 40 | logging.info( 41 | f"service: {service_name} found in namespace: {service_namespace}" 42 | ) 43 | logging.info(f"creating webservice and initializing test plan...") 44 | # both named ports and port numbers can be used 45 | if isinstance(target_port, int): 46 | logging.info(f"webservice will listen on port {target_port}") 47 | webservice = ( 48 | lib_telemetry.get_lib_kubernetes().deploy_service_hijacking( 49 | service_namespace, plan, image, port_number=target_port 50 | ) 51 | ) 52 | else: 53 | logging.info(f"traffic will be redirected to named port: {target_port}") 54 | webservice = ( 55 | lib_telemetry.get_lib_kubernetes().deploy_service_hijacking( 56 | service_namespace, plan, image, port_name=target_port 57 | ) 58 | ) 59 | logging.info( 60 | f"successfully deployed pod: {webservice.pod_name} " 61 | f"in namespace:{service_namespace} with selector {webservice.selector}!" 62 | ) 63 | logging.info( 64 | f"patching service: {service_name} to hijack traffic towards: {webservice.pod_name}" 65 | ) 66 | original_service = ( 67 | lib_telemetry.get_lib_kubernetes().replace_service_selector( 68 | [webservice.selector], service_name, service_namespace 69 | ) 70 | ) 71 | if original_service is None: 72 | logging.error( 73 | f"ServiceHijackingScenarioPlugin failed to patch service: {service_name}, namespace: {service_namespace} with selector {webservice.selector}" 74 | ) 75 | return 1 76 | 77 | logging.info(f"service: {service_name} successfully patched!") 78 | logging.info(f"original service manifest:\n\n{yaml.dump(original_service)}") 79 | logging.info(f"waiting {chaos_duration} before restoring the service") 80 | time.sleep(chaos_duration) 81 | selectors = [ 82 | "=".join([key, original_service["spec"]["selector"][key]]) 83 | for key in original_service["spec"]["selector"].keys() 84 | ] 85 | logging.info(f"restoring the service selectors {selectors}") 86 | original_service = ( 87 | lib_telemetry.get_lib_kubernetes().replace_service_selector( 88 | selectors, service_name, service_namespace 89 | ) 90 | ) 91 | if original_service is None: 92 | logging.error( 93 | f"ServiceHijackingScenarioPlugin failed to restore original " 94 | f"service: {service_name}, namespace: {service_namespace} with selectors: {selectors}" 95 | ) 96 | return 1 97 | logging.info("selectors successfully restored") 98 | logging.info("undeploying service-hijacking resources...") 99 | lib_telemetry.get_lib_kubernetes().undeploy_service_hijacking(webservice) 100 | return 0 101 | except Exception as e: 102 | logging.error( 103 | f"ServiceHijackingScenarioPlugin scenario {scenario} failed with exception: {e}" 104 | ) 105 | return 1 106 | 107 | def get_scenario_types(self) -> list[str]: 108 | return ["service_hijacking_scenarios"] 109 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/shut_down/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/shut_down/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/syn_flood/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/syn_flood/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/time_actions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/time_actions/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/zone_outage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/zone_outage/__init__.py -------------------------------------------------------------------------------- /krkn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/tests/__init__.py -------------------------------------------------------------------------------- /krkn/tests/test_classes/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class WrongModuleScenarioPlugin(AbstractScenarioPlugin): 10 | def run( 11 | self, 12 | run_uuid: str, 13 | scenario: str, 14 | krkn_config: dict[str, any], 15 | lib_telemetry: KrknTelemetryOpenshift, 16 | scenario_telemetry: ScenarioTelemetry, 17 | ) -> int: 18 | pass 19 | 20 | def get_scenario_types(self) -> list[str]: 21 | pass 22 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/correct_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class CorrectScenarioPlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | return ["correct_scenarios", "scenarios_correct"] 23 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/duplicated_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn_lib.models.telemetry import ScenarioTelemetry 2 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 3 | 4 | from krkn.scenario_plugins.abstract_scenario_plugin import 
AbstractScenarioPlugin 5 | 6 | 7 | class DuplicatedScenarioPlugin(AbstractScenarioPlugin): 8 | 9 | def run( 10 | self, 11 | run_uuid: str, 12 | scenario: str, 13 | krkn_config: dict[str, any], 14 | lib_telemetry: KrknTelemetryOpenshift, 15 | scenario_telemetry: ScenarioTelemetry, 16 | ) -> int: 17 | pass 18 | 19 | def get_scenario_types(self) -> list[str]: 20 | return ["another_irrelevant_scenario", "duplicated_scenario"] 21 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/duplicated_two_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn_lib.models.telemetry import ScenarioTelemetry 2 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 3 | 4 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 5 | 6 | 7 | class DuplicatedTwoScenarioPlugin(AbstractScenarioPlugin): 8 | 9 | def run( 10 | self, 11 | run_uuid: str, 12 | scenario: str, 13 | krkn_config: dict[str, any], 14 | lib_telemetry: KrknTelemetryOpenshift, 15 | scenario_telemetry: ScenarioTelemetry, 16 | ) -> int: 17 | pass 18 | 19 | def get_scenario_types(self) -> list[str]: 20 | return ["duplicated_scenario", "irellevant_scenario"] 21 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/example_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn_lib.models.telemetry import ScenarioTelemetry 2 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 3 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 4 | 5 | 6 | # Each plugin must extend the AbstractScenarioPlugin abstract class 7 | # and implement its methods. Also the naming conventions must be respected 8 | # you can refer to the documentation for the details: 9 | # https://github.com/krkn-chaos/krkn/blob/main/docs/scenario_plugin_api.md 10 | class ExampleScenarioPlugin(AbstractScenarioPlugin): 11 | 12 | def run( 13 | self, 14 | run_uuid: str, 15 | scenario: str, 16 | krkn_config: dict[str, any], 17 | lib_telemetry: KrknTelemetryOpenshift, 18 | scenario_telemetry: ScenarioTelemetry, 19 | ) -> int: 20 | """ 21 | :param run_uuid: the uuid of the chaos run generated by krkn for every single run 22 | :param scenario: the config file of the scenario that is currently executed 23 | :param krkn_config: the full dictionary representation of the `config.yaml` 24 | :param lib_telemetry: it is a composite object of all the 25 | [krkn-lib](https://krkn-chaos.github.io/krkn-lib-docs/modules.html) 26 | objects and methods needed by a krkn plugin to run. 27 | :param scenario_telemetry: the `ScenarioTelemetry` object of the scenario that is currently executed 28 | """ 29 | 30 | pass 31 | 32 | try: 33 | # The scenario logic for each scenario must be placed 34 | # here. A try-except it is needed to catch exceptions 35 | # that may occur in this section and they shouldn't 36 | # be propagated outside (only int return value is admitted). 
37 | 38 | # krkn-lib KrknKubernetes object containing all the kubernetes primitives 39 | # can be retrieved by the KrknTelemetryOpenshift object 40 | krkn_kubernetes = lib_telemetry.get_lib_kubernetes() 41 | 42 | # krkn-lib KrknOpenshift object containing all the OCP primitives 43 | # can be retrieved by the KrknTelemetryOpenshift object 44 | krkn_openshift = lib_telemetry.get_lib_ocp() 45 | 46 | # if the scenario succeeds the telemetry exit status is 0 47 | return 0 48 | except Exception as e: 49 | # if the scenario fails the telemetry exit status is 1 50 | return 1 51 | 52 | # Reflects the scenario type defined in the config.yaml 53 | # in the chaos_scenarios section and to which each class 54 | # responds. 55 | def get_scenario_types(self) -> list[str]: 56 | return ["example_scenarios"] 57 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/snake_case_mismatch_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class SnakeMismatchScenarioPlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | pass 23 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/wrong_classname_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class WrongClassNamePlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | pass 23 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/wrong_module.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class WrongModuleScenarioPlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | pass 23 | -------------------------------------------------------------------------------- /krkn/tests/test_plugin_factory.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from krkn.scenario_plugins.abstract_scenario_plugin 
import AbstractScenarioPlugin 4 | from krkn.scenario_plugins.scenario_plugin_factory import ScenarioPluginFactory 5 | from krkn.tests.test_classes.correct_scenario_plugin import ( 6 | CorrectScenarioPlugin, 7 | ) 8 | 9 | 10 | class TestPluginFactory(unittest.TestCase): 11 | 12 | def test_plugin_factory(self): 13 | factory = ScenarioPluginFactory("krkn.tests.test_classes") 14 | self.assertEqual(len(factory.loaded_plugins), 5) 15 | self.assertEqual(len(factory.failed_plugins), 4) 16 | self.assertIs( 17 | factory.loaded_plugins["correct_scenarios"].__base__, 18 | AbstractScenarioPlugin, 19 | ) 20 | self.assertTrue( 21 | isinstance( 22 | factory.loaded_plugins["correct_scenarios"](), CorrectScenarioPlugin 23 | ) 24 | ) 25 | # soLid 26 | self.assertTrue( 27 | isinstance( 28 | factory.loaded_plugins["correct_scenarios"](), AbstractScenarioPlugin 29 | ) 30 | ) 31 | 32 | self.assertTrue( 33 | "krkn.tests.test_classes.snake_case_mismatch_scenario_plugin" 34 | in [p[0] for p in factory.failed_plugins] 35 | ) 36 | self.assertTrue( 37 | "krkn.tests.test_classes.wrong_classname_scenario_plugin" 38 | in [p[0] for p in factory.failed_plugins] 39 | ) 40 | self.assertTrue( 41 | "krkn.tests.test_classes.wrong_module" 42 | in [p[0] for p in factory.failed_plugins] 43 | ) 44 | 45 | def test_plugin_factory_naming_convention(self): 46 | factory = ScenarioPluginFactory() 47 | correct_module_name = "krkn.scenario_plugins.example.correct_scenario_plugin" 48 | correct_class_name = "CorrectScenarioPlugin" 49 | correct_class_name_no_match = "NoMatchScenarioPlugin" 50 | wrong_module_name = "krkn.scenario_plugins.example.correct_plugin" 51 | wrong_class_name = "WrongScenario" 52 | wrong_folder_name_plugin = ( 53 | "krkn.scenario_plugins.example_plugin.example_plugin_scenario_plugin" 54 | ) 55 | wrong_folder_name_plugin_class_name = "ExamplePluginScenarioPlugin" 56 | wrong_folder_name_scenario = ( 57 | "krkn.scenario_plugins.example_scenario.example_scenario_scenario_plugin" 58 | ) 59 | wrong_folder_name_scenario_class_name = "ExampleScenarioScenarioPlugin" 60 | 61 | result, message = factory.is_naming_convention_correct( 62 | correct_module_name, correct_class_name 63 | ) 64 | self.assertTrue(result) 65 | self.assertIsNone(message) 66 | 67 | result, message = factory.is_naming_convention_correct( 68 | wrong_module_name, correct_class_name 69 | ) 70 | self.assertFalse(result) 71 | self.assertEqual( 72 | message, 73 | "scenario plugin module file names must end with `_scenario_plugin` suffix", 74 | ) 75 | 76 | result, message = factory.is_naming_convention_correct( 77 | correct_module_name, wrong_class_name 78 | ) 79 | self.assertFalse(result) 80 | self.assertEqual( 81 | message, 82 | "scenario plugin class name must start with a capital letter, " 83 | "end with `ScenarioPlugin`, and cannot be just `ScenarioPlugin`.", 84 | ) 85 | 86 | result, message = factory.is_naming_convention_correct( 87 | correct_module_name, correct_class_name_no_match 88 | ) 89 | self.assertFalse(result) 90 | self.assertEqual( 91 | message, 92 | "module file name must in snake case must match class name in capital camel case " 93 | "e.g. 
`example_scenario_plugin` -> `ExampleScenarioPlugin`", 94 | ) 95 | 96 | result, message = factory.is_naming_convention_correct( 97 | wrong_folder_name_plugin, wrong_folder_name_plugin_class_name 98 | ) 99 | self.assertFalse(result) 100 | self.assertEqual( 101 | message, "scenario plugin folder cannot contain `scenario` or `plugin` word" 102 | ) 103 | 104 | result, message = factory.is_naming_convention_correct( 105 | wrong_folder_name_scenario, wrong_folder_name_scenario_class_name 106 | ) 107 | self.assertFalse(result) 108 | self.assertEqual( 109 | message, "scenario plugin folder cannot contain `scenario` or `plugin` word" 110 | ) 111 | -------------------------------------------------------------------------------- /krkn/utils/HealthChecker.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import logging 4 | import queue 5 | from datetime import datetime 6 | from krkn_lib.models.telemetry.models import HealthCheck 7 | 8 | class HealthChecker: 9 | current_iterations: int = 0 10 | ret_value = 0 11 | def __init__(self, iterations): 12 | self.iterations = iterations 13 | 14 | def make_request(self, url, auth=None, headers=None, verify=True): 15 | response_data = {} 16 | response = requests.get(url, auth=auth, headers=headers, verify=verify) 17 | response_data["url"] = url 18 | response_data["status"] = response.status_code == 200 19 | response_data["status_code"] = response.status_code 20 | return response_data 21 | 22 | 23 | def run_health_check(self, health_check_config, health_check_telemetry_queue: queue.Queue): 24 | if health_check_config and health_check_config["config"] and any(config.get("url") for config in health_check_config["config"]): 25 | health_check_start_time_stamp = datetime.now() 26 | health_check_telemetry = [] 27 | health_check_tracker = {} 28 | interval = health_check_config["interval"] if health_check_config["interval"] else 2 29 | 30 | response_tracker = {config["url"]:True for config in health_check_config["config"]} 31 | while self.current_iterations < self.iterations: 32 | for config in health_check_config.get("config"): 33 | auth, headers = None, None 34 | verify_url = config["verify_url"] if "verify_url" in config else True 35 | if config["url"]: url = config["url"] 36 | 37 | if config["bearer_token"]: 38 | bearer_token = "Bearer " + config["bearer_token"] 39 | headers = {"Authorization": bearer_token} 40 | 41 | if config["auth"]: auth = tuple(config["auth"].split(',')) 42 | response = self.make_request(url, auth, headers, verify_url) 43 | 44 | if response["status_code"] != 200: 45 | if config["url"] not in health_check_tracker: 46 | start_timestamp = datetime.now() 47 | health_check_tracker[config["url"]] = { 48 | "status_code": response["status_code"], 49 | "start_timestamp": start_timestamp 50 | } 51 | if response_tracker[config["url"]] != False: response_tracker[config["url"]] = False 52 | if config["exit_on_failure"] and config["exit_on_failure"] == True and self.ret_value==0: self.ret_value = 2 53 | else: 54 | if config["url"] in health_check_tracker: 55 | end_timestamp = datetime.now() 56 | start_timestamp = health_check_tracker[config["url"]]["start_timestamp"] 57 | previous_status_code = str(health_check_tracker[config["url"]]["status_code"]) 58 | duration = (end_timestamp - start_timestamp).total_seconds() 59 | downtime_record = { 60 | "url": config["url"], 61 | "status": False, 62 | "status_code": previous_status_code, 63 | "start_timestamp": start_timestamp.isoformat(), 64 | 
"end_timestamp": end_timestamp.isoformat(), 65 | "duration": duration 66 | } 67 | health_check_telemetry.append(HealthCheck(downtime_record)) 68 | del health_check_tracker[config["url"]] 69 | time.sleep(interval) 70 | health_check_end_time_stamp = datetime.now() 71 | for url, status in response_tracker.items(): 72 | if status == True: 73 | duration = (health_check_end_time_stamp - health_check_start_time_stamp).total_seconds() 74 | success_response = { 75 | "url": url, 76 | "status": True, 77 | "status_code": 200, 78 | "start_timestamp": health_check_start_time_stamp.isoformat(), 79 | "end_timestamp": health_check_end_time_stamp.isoformat(), 80 | "duration": duration 81 | } 82 | health_check_telemetry.append(HealthCheck(success_response)) 83 | health_check_telemetry_queue.put(health_check_telemetry) 84 | else: 85 | logging.info("health checks config is not defined, skipping them") -------------------------------------------------------------------------------- /krkn/utils/TeeLogHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | class TeeLogHandler(logging.Handler): 3 | logs: list[str] = [] 4 | name = "TeeLogHandler" 5 | 6 | def get_output(self) -> str: 7 | return "\n".join(self.logs) 8 | 9 | def emit(self, record): 10 | self.logs.append(self.formatter.format(record)) 11 | def __del__(self): 12 | pass -------------------------------------------------------------------------------- /krkn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .TeeLogHandler import TeeLogHandler 2 | from .functions import * 3 | -------------------------------------------------------------------------------- /krkn/utils/functions.py: -------------------------------------------------------------------------------- 1 | import krkn_lib.utils 2 | from krkn_lib.k8s import KrknKubernetes 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | from tzlocal.unix import get_localzone 6 | import logging 7 | 8 | def populate_cluster_events( 9 | krkn_config: dict, 10 | scenario_config: dict, 11 | kubecli: KrknKubernetes, 12 | start_timestamp: int, 13 | end_timestamp: int, 14 | ): 15 | events = [] 16 | namespaces = __retrieve_namespaces(scenario_config, kubecli) 17 | 18 | if len(namespaces) == 0: 19 | events.extend( 20 | kubecli.collect_and_parse_cluster_events( 21 | start_timestamp, end_timestamp, str(get_localzone()) 22 | ) 23 | ) 24 | else: 25 | for namespace in namespaces: 26 | events.extend( 27 | kubecli.collect_and_parse_cluster_events( 28 | start_timestamp, 29 | end_timestamp, 30 | str(get_localzone()), 31 | namespace=namespace, 32 | ) 33 | ) 34 | archive_path = krkn_config["telemetry"]["archive_path"] 35 | file_path = archive_path + "/events.json" 36 | with open(file_path, "w+") as f: 37 | f.write("\n".join(str(item) for item in events)) 38 | logging.info(f'Find cluster events in file {file_path}' ) 39 | 40 | 41 | 42 | def collect_and_put_ocp_logs( 43 | telemetry_ocp: KrknTelemetryOpenshift, 44 | scenario_config: dict, 45 | request_id: str, 46 | start_timestamp: int, 47 | end_timestamp: int, 48 | ): 49 | if ( 50 | telemetry_ocp.get_telemetry_config() 51 | and telemetry_ocp.get_telemetry_config()["enabled"] 52 | and telemetry_ocp.get_telemetry_config()["logs_backup"] 53 | and not telemetry_ocp.get_lib_kubernetes().is_kubernetes() 54 | ): 55 | namespaces = __retrieve_namespaces( 56 | scenario_config, 
telemetry_ocp.get_lib_kubernetes() 57 | ) 58 | if len(namespaces) > 0: 59 | for namespace in namespaces: 60 | telemetry_ocp.put_ocp_logs( 61 | request_id, 62 | telemetry_ocp.get_telemetry_config(), 63 | start_timestamp, 64 | end_timestamp, 65 | namespace, 66 | ) 67 | else: 68 | telemetry_ocp.put_ocp_logs( 69 | request_id, 70 | telemetry_ocp.get_telemetry_config(), 71 | start_timestamp, 72 | end_timestamp, 73 | ) 74 | 75 | 76 | def __retrieve_namespaces(scenario_config: dict, kubecli: KrknKubernetes) -> set[str]: 77 | namespaces = list() 78 | namespaces.extend(krkn_lib.utils.deep_get_attribute("namespace", scenario_config)) 79 | namespace_patterns = krkn_lib.utils.deep_get_attribute( 80 | "namespace_pattern", scenario_config 81 | ) 82 | for pattern in namespace_patterns: 83 | namespaces.extend(kubecli.list_namespaces_by_regex(pattern)) 84 | return set(namespaces) 85 | -------------------------------------------------------------------------------- /media/KrakenStarting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/media/KrakenStarting.png -------------------------------------------------------------------------------- /media/kraken-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/media/kraken-workflow.png -------------------------------------------------------------------------------- /media/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/media/logo.png -------------------------------------------------------------------------------- /rbac/non-privileged-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: krkn-non-privileged-role 5 | namespace: target-namespace 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods", "services"] 9 | verbs: ["get", "list", "watch", "create", "delete"] 10 | - apiGroups: ["apps"] 11 | resources: ["deployments", "statefulsets"] 12 | verbs: ["get", "list", "watch", "create", "delete"] 13 | - apiGroups: ["batch"] 14 | resources: ["jobs"] 15 | verbs: ["get", "list", "watch", "create", "delete"] 16 | -------------------------------------------------------------------------------- /rbac/non-privileged-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: krkn-non-privileged-rolebinding 5 | namespace: target-namespace 6 | subjects: 7 | - kind: ServiceAccount 8 | name: krkn-sa 9 | namespace: target-namespace 10 | roleRef: 11 | kind: Role 12 | name: krkn-non-privileged-role 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /rbac/privileged-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: krkn-privileged-clusterrole 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["nodes"] 8 | verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 9 | - apiGroups: [""] 10 | resources: ["pods", "services"] 11 
| verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 12 | - apiGroups: ["apps"] 13 | resources: ["deployments", "statefulsets"] 14 | verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 15 | - apiGroups: ["batch"] 16 | resources: ["jobs"] 17 | verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 18 | -------------------------------------------------------------------------------- /rbac/privileged-clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: krkn-privileged-clusterrolebinding 5 | subjects: 6 | - kind: ServiceAccount 7 | name: krkn-sa 8 | namespace: krkn-namespace 9 | roleRef: 10 | kind: ClusterRole 11 | name: krkn-privileged-clusterrole 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aliyun-python-sdk-core==2.13.36 2 | aliyun-python-sdk-ecs==4.24.25 3 | arcaflow-plugin-sdk==0.14.0 4 | boto3==1.28.61 5 | azure-identity==1.16.1 6 | azure-keyvault==4.2.0 7 | azure-mgmt-compute==30.5.0 8 | azure-mgmt-network==27.0.0 9 | itsdangerous==2.0.1 10 | coverage==7.6.12 11 | datetime==5.4 12 | docker==7.0.0 13 | gitpython==3.1.41 14 | google-auth==2.37.0 15 | google-cloud-compute==1.22.0 16 | ibm_cloud_sdk_core==3.18.0 17 | ibm_vpc==0.20.0 18 | jinja2==3.1.6 19 | krkn-lib==5.0.1 20 | lxml==5.1.0 21 | kubernetes==28.1.0 22 | numpy==1.26.4 23 | pandas==2.2.0 24 | openshift-client==1.0.21 25 | paramiko==3.4.0 26 | pyVmomi==8.0.2.0.1 27 | pyfiglet==1.0.2 28 | pytest==8.0.0 29 | python-ipmi==0.5.4 30 | python-openstackclient==6.5.0 31 | requests==2.32.2 32 | service_identity==24.1.0 33 | PyYAML==6.0.1 34 | setuptools==78.1.1 35 | werkzeug==3.0.6 36 | wheel==0.42.0 37 | zope.interface==5.4.0 38 | 39 | 40 | git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git@v0.1.0 41 | git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0 42 | cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability 43 | -------------------------------------------------------------------------------- /scenarios/kind/node_scenarios_example.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # node chaos scenarios to be injected 3 | - node_stop_start_scenario 4 | node_name: kind-worker # node on which scenario has to be injected; can set multiple names separated by comma 5 | # label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection 6 | instance_count: 1 # Number of nodes to perform action/select that match the label selector 7 | runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) 8 | timeout: 120 # duration to wait for completion of node scenario injection 9 | cloud_type: docker # cloud type on which Kubernetes/OpenShift runs 10 | - actions: 11 | - node_reboot_scenario 12 | node_name: kind-worker 13 | # label_selector: node-role.kubernetes.io/infra 14 | instance_count: 1 15 | timeout: 120 16 | cloud_type: docker 17 | -------------------------------------------------------------------------------- /scenarios/kind/scheduler.yml: 
-------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^kube-system$ 5 | label_selector: component=kube-scheduler 6 | krkn_pod_recovery_time: 120 7 | -------------------------------------------------------------------------------- /scenarios/kube/container_dns.yml: -------------------------------------------------------------------------------- 1 | scenarios: 2 | - name: "kill dns container" 3 | namespace: "kube-system" 4 | label_selector: "k8s-app=kube-dns" 5 | container_name: "" 6 | action: 1 7 | count: 1 8 | retry_wait: 60 9 | -------------------------------------------------------------------------------- /scenarios/kube/cpu-hog.yml: -------------------------------------------------------------------------------- 1 | duration: 60 2 | workers: '' # leave it empty '' node cpu auto-detection 3 | hog-type: cpu 4 | image: quay.io/krkn-chaos/krkn-hog 5 | namespace: default 6 | cpu-load-percentage: 90 7 | cpu-method: all 8 | node-selector: "node-role.kubernetes.io/worker=" 9 | number-of-nodes: 2 10 | taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"] 11 | -------------------------------------------------------------------------------- /scenarios/kube/io-hog.yml: -------------------------------------------------------------------------------- 1 | duration: 30 2 | workers: '' # leave it empty '' node cpu auto-detection 3 | hog-type: io 4 | image: quay.io/krkn-chaos/krkn-hog 5 | namespace: default 6 | io-block-size: 1m 7 | io-write-bytes: 1g 8 | io-target-pod-folder: /hog-data 9 | io-target-pod-volume: 10 | name: node-volume 11 | hostPath: 12 | path: /root # a path writable by kubelet in the root filesystem of the node 13 | node-selector: "node-role.kubernetes.io/worker=" 14 | number-of-nodes: '' 15 | taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"] -------------------------------------------------------------------------------- /scenarios/kube/managedcluster_scenarios_example.yml: -------------------------------------------------------------------------------- 1 | managedcluster_scenarios: 2 | - actions: # ManagedCluster chaos scenarios to be injected 3 | - managedcluster_stop_start_scenario 4 | managedcluster_name: cluster1 # ManagedCluster on which scenario has to be injected; can set multiple names separated by comma 5 | # label_selector: # When managedcluster_name is not specified, a ManagedCluster with matching label_selector is selected for ManagedCluster chaos scenario injection 6 | instance_count: 1 # Number of managedcluster to perform action/select that match the label selector 7 | runs: 1 # Number of times to inject each scenario under actions (will perform on same ManagedCluster each time) 8 | timeout: 420 # Duration to wait for completion of ManagedCluster scenario injection 9 | # For OCM to detect a ManagedCluster as unavailable, have to wait 5*leaseDurationSeconds 10 | # (default leaseDurationSeconds = 60 sec) 11 | - actions: 12 | - stop_start_klusterlet_scenario 13 | managedcluster_name: cluster1 14 | # label_selector: 15 | instance_count: 1 16 | runs: 1 17 | timeout: 60 -------------------------------------------------------------------------------- /scenarios/kube/memory-hog.yml: -------------------------------------------------------------------------------- 1 | duration: 60 2 | workers: '' # leave it empty '' node cpu auto-detection 3 | hog-type: memory 4 | image: quay.io/krkn-chaos/krkn-hog 5 | namespace: 
default 6 | memory-vm-bytes: 90% 7 | node-selector: "node-role.kubernetes.io/worker=" 8 | number-of-nodes: '' 9 | taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"] 10 | -------------------------------------------------------------------------------- /scenarios/kube/network-filter.yml: -------------------------------------------------------------------------------- 1 | - id: node_network_filter 2 | wait_duration: 300 3 | test_duration: 100 4 | label_selector: "kubernetes.io/hostname=ip-10-0-39-182.us-east-2.compute.internal" 5 | namespace: 'default' 6 | instance_count: 1 7 | execution: parallel 8 | ingress: false 9 | egress: true 10 | target: node 11 | interfaces: [] 12 | ports: 13 | - 2049 -------------------------------------------------------------------------------- /scenarios/kube/pod.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | name_pattern: ^nginx-.*$ 5 | namespace_pattern: ^default$ 6 | kill: 1 7 | krkn_pod_recovery_time: 120 8 | -------------------------------------------------------------------------------- /scenarios/kube/scheduler.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^kube-system$ 5 | label_selector: k8s-app=kube-scheduler 6 | krkn_pod_recovery_time: 120 7 | -------------------------------------------------------------------------------- /scenarios/kube/service_hijacking.yaml: -------------------------------------------------------------------------------- 1 | # refer to the documentation for further infos https://github.com/krkn-chaos/krkn/blob/main/docs/service_hijacking.md 2 | 3 | service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration). 4 | service_name: nginx-service # name of the service to be hijacked 5 | service_namespace: default # The namespace where the target service is located 6 | image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service to be deployed to receive traffic. 7 | chaos_duration: 30 # Total duration of the chaos scenario in seconds. 8 | plan: 9 | - resource: "/list/index.php" # Specifies the resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored. 10 | # For resources, only query parameters are captured. 11 | 12 | steps: # A time-based plan consisting of steps can be defined for each resource. 13 | GET: # One or more HTTP methods can be specified for each step. 14 | # Note: Non-standard methods are supported 15 | # for fully custom web services (e.g., using NONEXISTENT instead of POST). 16 | 17 | - duration: 15 # Duration in seconds for this step before moving to the next one, if defined. Otherwise, 18 | # this step will continue until the chaos scenario ends. 19 | 20 | status: 500 # HTTP status code to be returned in this step. 21 | mime_type: "application/json" # MIME type of the response for this step. 22 | payload: | # The response payload for this step. 
23 | { 24 | "status":"internal server error" 25 | } 26 | - duration: 15 27 | status: 201 28 | mime_type: "application/json" 29 | payload: | 30 | { 31 | "status":"resource created" 32 | } 33 | POST: 34 | - duration: 15 35 | status: 401 36 | mime_type: "application/json" 37 | payload: | 38 | { 39 | "status": "unauthorized" 40 | } 41 | - duration: 15 42 | status: 404 43 | mime_type: "text/plain" 44 | payload: "not found" 45 | 46 | - resource: "/patch" 47 | steps: 48 | PATCH: 49 | - duration: 15 50 | status: 201 51 | mime_type: "text/plain" 52 | payload: "resource patched" 53 | - duration: 15 54 | status: 400 55 | mime_type: "text/plain" 56 | payload: "bad request" -------------------------------------------------------------------------------- /scenarios/kube/syn_flood.yaml: -------------------------------------------------------------------------------- 1 | packet-size: 120 # hping3 packet size 2 | window-size: 64 # hping 3 TCP window size 3 | duration: 10 # chaos scenario duration 4 | namespace: default # namespace where the target service(s) are deployed 5 | target-service: elasticsearch # target service name (if set target-service-label must be empty) 6 | target-port: 9200 # target service TCP port 7 | target-service-label : "" # target service label, can be used to target multiple target at the same time 8 | # if they have the same label set (if set target-service must be empty) 9 | number-of-pods: 2 # number of attacker pod instantiated per each target 10 | image: quay.io/krkn-chaos/krkn-syn-flood:v1.0.0 # syn flood attacker container image 11 | attacker-nodes: # this will set the node affinity to schedule the attacker node. Per each node label selector 12 | node-role.kubernetes.io/worker: # can be specified multiple values in this way the kube scheduler will schedule the attacker pods 13 | - "" # in the best way possible based on the provided labels. 
Multiple labels can be specified 14 | # set empty value `attacker-nodes: {}` to let kubernetes schedule the pods 15 | 16 | 17 | -------------------------------------------------------------------------------- /scenarios/openshift/app_outage.yaml: -------------------------------------------------------------------------------- 1 | application_outage: # Scenario to create an outage of an application by blocking traffic 2 | duration: 600 # Duration in seconds after which the routes will be accessible 3 | namespace: # Namespace to target - all application routes will go inaccessible if pod selector is empty 4 | pod_selector: {app: foo} # Pods to target 5 | block: [Ingress, Egress] # It can be Ingress or Egress or Ingress, Egress 6 | -------------------------------------------------------------------------------- /scenarios/openshift/aws_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # node chaos scenarios to be injected 3 | - node_stop_start_scenario 4 | node_name: # node on which scenario has to be injected; can set multiple names separated by comma 5 | label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection; can specify multiple by a comma separated list 6 | instance_count: 2 # Number of nodes to perform action/select that match the label selector 7 | runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) 8 | timeout: 360 # duration to wait for completion of node scenario injection 9 | duration: 20 # duration to stop the node before running the start action 10 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 11 | parallel: true # Run action on label or node name in parallel or sequential, defaults to sequential 12 | - actions: 13 | - node_reboot_scenario 14 | node_name: 15 | label_selector: node-role.kubernetes.io/infra 16 | instance_count: 1 17 | timeout: 120 18 | cloud_type: aws 19 | - actions: 20 | - node_disk_detach_attach_scenario 21 | node_name: 22 | label_selector: 23 | instance_count: 1 24 | timeout: 120 25 | cloud_type: aws -------------------------------------------------------------------------------- /scenarios/openshift/azure_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: 3 | - node_reboot_scenario 4 | node_name: 5 | label_selector: node-role.kubernetes.io/infra 6 | instance_count: 1 7 | timeout: 120 8 | cloud_type: azure 9 | - actions: 10 | - node_stop_start_scenario 11 | node_name: 12 | label_selector: node-role.kubernetes.io/infra 13 | instance_count: 1 14 | timeout: 360 15 | duration: 120 16 | cloud_type: azure 17 | -------------------------------------------------------------------------------- /scenarios/openshift/baremetal_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # Node chaos scenarios to be injected. 3 | - node_stop_start_scenario 4 | node_name: # Node on which scenario has to be injected. 5 | label_selector: node-role.kubernetes.io/worker # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection. 6 | instance_count: 1 # Number of nodes to perform action/select that match the label selector. 
7 | runs: 1 # Number of times to inject each scenario under actions (will perform on same node each time). 8 | timeout: 360 # Duration to wait for completion of node scenario injection. 9 | duration: 120 # Duration to stop the node before running the start action 10 | cloud_type: bm # Cloud type on which Kubernetes/OpenShift runs. 11 | bmc_user: defaultuser # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines. 12 | bmc_password: defaultpass # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines. 13 | bmc_info: # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info. 14 | node-1: # The node name for the baremetal machine 15 | bmc_addr: mgmt-machine1.example.com # Optional. For baremetal nodes with the IPMI BMC address missing from 'oc get bmh'. 16 | node-2: 17 | bmc_addr: mgmt-machine2.example.com 18 | bmc_user: user # The baremetal IPMI user. Overrides the default IPMI user specified above. Optional if the default is set. 19 | bmc_password: pass # The baremetal IPMI password. Overrides the default IPMI user specified above. Optional if the default is set 20 | -------------------------------------------------------------------------------- /scenarios/openshift/cluster_shut_down_scenario.yml: -------------------------------------------------------------------------------- 1 | cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes 2 | runs: 1 # Number of times to execute the cluster_shut_down scenario 3 | shut_down_duration: 150 # duration in seconds to shut down the cluster 4 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 5 | timeout: 60 # Number of seconds to wait for each node to be stopped or running 6 | -------------------------------------------------------------------------------- /scenarios/openshift/container_etcd.yml: -------------------------------------------------------------------------------- 1 | scenarios: 2 | - name: "kill etcd container" 3 | namespace: "openshift-etcd" 4 | label_selector: "k8s-app=etcd" 5 | container_name: "etcd" 6 | action: 1 7 | count: 1 8 | expected_recovery_time: 120 9 | -------------------------------------------------------------------------------- /scenarios/openshift/customapp_pod.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^acme-air$ 5 | name_pattern: .* 6 | krkn_pod_recovery_time: 120 -------------------------------------------------------------------------------- /scenarios/openshift/etcd.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^openshift-etcd$ 5 | label_selector: k8s-app=etcd 6 | krkn_pod_recovery_time: 120 7 | -------------------------------------------------------------------------------- /scenarios/openshift/gcp_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: 3 | - node_reboot_scenario 4 | node_name: 5 | label_selector: node-role.kubernetes.io/worker 6 | instance_count: 1 7 | timeout: 120 8 | cloud_type: gcp 9 | - actions: 10 | - node_stop_start_scenario 11 | node_name: 12 | label_selector: node-role.kubernetes.io/worker 13 | 
instance_count: 1 14 | timeout: 360 15 | duration: 120 16 | cloud_type: gcp 17 | -------------------------------------------------------------------------------- /scenarios/openshift/ibmcloud_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: 3 | - node_stop_start_scenario 4 | node_name: 5 | label_selector: node-role.kubernetes.io/worker 6 | instance_count: 1 7 | timeout: 360 8 | duration: 120 9 | cloud_type: ibm 10 | - actions: 11 | - node_reboot_scenario 12 | node_name: 13 | label_selector: node-role.kubernetes.io/worker 14 | instance_count: 1 15 | timeout: 120 16 | cloud_type: ibm -------------------------------------------------------------------------------- /scenarios/openshift/ingress_namespace.yaml: -------------------------------------------------------------------------------- 1 | scenarios: 2 | - namespace: "^.*ingress.*$" 3 | runs: 1 4 | sleep: 15 5 | wait_time: 300 6 | -------------------------------------------------------------------------------- /scenarios/openshift/network_chaos.yaml: -------------------------------------------------------------------------------- 1 | network_chaos: # Scenario to create an outage by simulating random variations in the network. 2 | duration: 300 # seconds 3 | node_name: # node on which scenario has to be injected; 4 | label_selector: # when node_name is not specified, a node with matching label_selector is selected for running the scenario. 5 | instance_count: 1 6 | interfaces: # Interface name would be the Kernel host network interface name. 7 | - "" 8 | execution: serial 9 | egress: 10 | latency: 50ms # 50ms 11 | loss: 0.02 # percentage 12 | -------------------------------------------------------------------------------- /scenarios/openshift/network_chaos_ingress.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: network_chaos 3 | config: 4 | node_interface_name: # Dictionary with key as node name(s) and value as a list of its interfaces to test 5 | : 6 | - 7 | label_selector: # When node_interface_name is not specified, nodes with matching label_selector is selected for node chaos scenario injection 8 | instance_count: # Number of nodes to perform action/select that match the label selector 9 | kubeconfig_path: # Path to kubernetes config file. If not specified, it defaults to ~/.kube/config 10 | execution_type: # Used to specify whether you want to apply filters on interfaces one at a time or all at once. Default is 'parallel' 11 | network_params: # latency, loss and bandwidth are the three supported network parameters to alter for the chaos test 12 | latency: