├── .github ├── PULL_REQUEST_TEMPLATE.md ├── release-template.md ├── release.yml └── workflows │ ├── docker-image.yml │ ├── release.yml │ ├── require-docs.yml │ └── tests.yml ├── .gitignore ├── .gitleaks.toml ├── ADOPTERS.md ├── CI ├── README.md ├── config │ └── common_test_config.yaml ├── legacy │ ├── scenarios │ │ ├── cluster_shut_down_scenario.yml │ │ ├── node_scenario.yml │ │ ├── volume_scenario.yaml │ │ ├── zone_outage.yaml │ │ └── zone_outage_env.yaml │ └── tests │ │ ├── test_nodes.sh │ │ ├── test_shut_down.sh │ │ └── test_zone.sh ├── run.sh ├── run_test.sh ├── templates │ ├── container_scenario_pod.yaml │ ├── outage_pod.yaml │ ├── service_hijacking.yaml │ └── time_pod.yaml └── tests │ ├── common.sh │ ├── functional_tests │ ├── test_app_outages.sh │ ├── test_container.sh │ ├── test_cpu_hog.sh │ ├── test_io_hog.sh │ ├── test_memory_hog.sh │ ├── test_namespace.sh │ ├── test_net_chaos.sh │ ├── test_service_hijacking.sh │ ├── test_telemetry.sh │ └── test_time.sh ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MAINTAINERS.md ├── README.md ├── ROADMAP.md ├── SECURITY.md ├── ansible ├── ansible.cfg ├── inventory ├── kraken.yml ├── templates │ └── kraken.j2 └── vars │ └── kraken_vars.yml ├── config ├── alerts.yaml ├── alerts_openshift.yaml ├── cerberus.yaml ├── config.yaml ├── config_kind.yaml ├── config_kubernetes.yaml ├── config_performance.yaml ├── metrics-aggregated.yaml ├── metrics-report.yaml ├── metrics.yaml └── recommender_config.yaml ├── containers ├── Dockerfile.template ├── README.md ├── build_own_image-README.md ├── compile_dockerfile.sh └── krknctl-input.json ├── kind-config.yml ├── krkn ├── __init__.py ├── cerberus │ ├── __init__.py │ └── setup.py ├── chaos_recommender │ ├── __init__.py │ ├── analysis.py │ ├── kraken_tests.py │ └── prometheus.py ├── invoke │ ├── __init__.py │ └── command.py ├── performance_dashboards │ ├── __init__.py │ └── setup.py ├── prometheus │ ├── __init__.py │ └── client.py ├── scenario_plugins │ ├── __init__.py │ ├── abstract_scenario_plugin.py │ ├── application_outage │ │ ├── __init__.py │ │ └── application_outage_scenario_plugin.py │ ├── container │ │ ├── __init__.py │ │ └── container_scenario_plugin.py │ ├── hogs │ │ ├── __init__.py │ │ └── hogs_scenario_plugin.py │ ├── managed_cluster │ │ ├── __init__.py │ │ ├── common_functions.py │ │ ├── managed_cluster_scenario_plugin.py │ │ └── scenarios.py │ ├── native │ │ ├── __init__.py │ │ ├── native_scenario_plugin.py │ │ ├── network │ │ │ ├── cerberus.py │ │ │ ├── ingress_shaping.py │ │ │ ├── job.j2 │ │ │ ├── kubernetes_functions.py │ │ │ ├── pod_interface.j2 │ │ │ └── pod_module.j2 │ │ ├── plugins.py │ │ ├── pod_network_outage │ │ │ ├── cerberus.py │ │ │ ├── job.j2 │ │ │ ├── kubernetes_functions.py │ │ │ ├── pod_module.j2 │ │ │ └── pod_network_outage_plugin.py │ │ └── run_python_plugin.py │ ├── network_chaos │ │ ├── __init__.py │ │ ├── job.j2 │ │ ├── network_chaos_scenario_plugin.py │ │ └── pod.j2 │ ├── network_chaos_ng │ │ ├── __init__.py │ │ ├── models.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── abstract_network_chaos_module.py │ │ │ ├── node_network_filter.py │ │ │ └── templates │ │ │ │ └── network-chaos.j2 │ │ ├── network_chaos_factory.py │ │ └── network_chaos_ng_scenario_plugin.py │ ├── node_actions │ │ ├── __init__.py │ │ ├── abstract_node_scenarios.py │ │ ├── alibaba_node_scenarios.py │ │ ├── aws_node_scenarios.py │ │ ├── az_node_scenarios.py │ │ ├── bm_node_scenarios.py │ │ ├── common_node_functions.py │ │ ├── docker_node_scenarios.py │ │ ├── gcp_node_scenarios.py │ │ ├── 
general_cloud_node_scenarios.py │ │ ├── ibmcloud_node_scenarios.py │ │ ├── node_actions_scenario_plugin.py │ │ ├── openstack_node_scenarios.py │ │ └── vmware_node_scenarios.py │ ├── pvc │ │ ├── __init__.py │ │ └── pvc_scenario_plugin.py │ ├── scenario_plugin_factory.py │ ├── service_disruption │ │ ├── __init__.py │ │ └── service_disruption_scenario_plugin.py │ ├── service_hijacking │ │ ├── __init__.py │ │ └── service_hijacking_scenario_plugin.py │ ├── shut_down │ │ ├── __init__.py │ │ └── shut_down_scenario_plugin.py │ ├── syn_flood │ │ ├── __init__.py │ │ └── syn_flood_scenario_plugin.py │ ├── time_actions │ │ ├── __init__.py │ │ └── time_actions_scenario_plugin.py │ └── zone_outage │ │ ├── __init__.py │ │ └── zone_outage_scenario_plugin.py ├── tests │ ├── __init__.py │ ├── test_classes │ │ ├── __init__.py │ │ ├── correct_scenario_plugin.py │ │ ├── duplicated_scenario_plugin.py │ │ ├── duplicated_two_scenario_plugin.py │ │ ├── example_scenario_plugin.py │ │ ├── snake_case_mismatch_scenario_plugin.py │ │ ├── wrong_classname_scenario_plugin.py │ │ └── wrong_module.py │ └── test_plugin_factory.py └── utils │ ├── HealthChecker.py │ ├── TeeLogHandler.py │ ├── __init__.py │ └── functions.py ├── media ├── KrakenStarting.png ├── kraken-workflow.png └── logo.png ├── rbac ├── non-privileged-role.yaml ├── non-privileged-rolebinding.yaml ├── privileged-clusterrole.yaml └── privileged-clusterrolebinding.yaml ├── requirements.txt ├── run_kraken.py ├── scenarios ├── kind │ ├── node_scenarios_example.yml │ └── scheduler.yml ├── kube │ ├── container_dns.yml │ ├── cpu-hog.yml │ ├── io-hog.yml │ ├── managedcluster_scenarios_example.yml │ ├── memory-hog.yml │ ├── network-filter.yml │ ├── pod.yml │ ├── scheduler.yml │ ├── service_hijacking.yaml │ └── syn_flood.yaml ├── openshift │ ├── app_outage.yaml │ ├── aws_node_scenarios.yml │ ├── azure_node_scenarios.yml │ ├── baremetal_node_scenarios.yml │ ├── cluster_shut_down_scenario.yml │ ├── container_etcd.yml │ ├── customapp_pod.yaml │ ├── etcd.yml │ ├── gcp_node_scenarios.yml │ ├── ibmcloud_node_scenarios.yml │ ├── ingress_namespace.yaml │ ├── network_chaos.yaml │ ├── network_chaos_ingress.yml │ ├── openshift-apiserver.yml │ ├── openshift-kube-apiserver.yml │ ├── pod_egress_shaping.yml │ ├── pod_ingress_shaping.yml │ ├── pod_network_outage.yml │ ├── prom_kill.yml │ ├── prometheus.yml │ ├── pvc_scenario.yaml │ ├── regex_namespace.yaml │ ├── regex_openshift_pod_kill.yml │ ├── time_scenarios_example.yml │ ├── vmware_node_scenarios.yml │ ├── zone_outage.yaml │ └── zone_outage_gcp.yaml ├── plugin.schema.README.md └── plugin.schema.json ├── server.py ├── setup.cfg ├── setup.py ├── tests ├── test_ingress_network_plugin.py └── test_run_python_plugin.py └── utils ├── arcaflow └── ocp-chaos │ ├── README.md │ ├── config.yaml │ ├── input.yaml │ ├── subworkflows │ ├── cpu-hog.yaml │ ├── kubeburner.yaml │ └── pod-chaos.yaml │ └── workflow.yaml ├── chaos_ai ├── README.md ├── config │ └── experiments │ │ └── .gitkeep ├── docker │ ├── Dockerfile │ ├── aichaos-config.json │ ├── config │ │ ├── experiments │ │ │ └── log.yml │ │ ├── pod-delete.json │ │ └── yml │ │ │ ├── chaosGen.yml │ │ │ ├── episodes.yml │ │ │ ├── log.yml │ │ │ ├── qtable.yml │ │ │ └── status.yml │ ├── requirements.txt │ └── swagger_api.py ├── generate_wheel_package.py ├── requirements.txt └── src │ ├── __init__.py │ ├── aichaos.py │ ├── aichaos_main.py │ ├── experiments.py │ ├── kraken_utils.py │ ├── qlearning.py │ ├── swagger_api.py │ ├── test_application.py │ └── utils.py └── chaos_recommender ├── README.md ├── 
chaos_recommender.py └── recommender_config.yaml /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | ## Documentation 5 | - [ ] **Is documentation needed for this update?** 6 | 7 | If checked, a documentation PR must be created and merged in the [website repository](https://github.com/krkn-chaos/website/). 8 | 9 | ## Related Documentation PR (if applicable) 10 | -------------------------------------------------------------------------------- /.github/release-template.md: -------------------------------------------------------------------------------- 1 | ## Release {VERSION} 2 | 3 | ### Download Artifacts 4 | - 📦 Krkn sources (noarch): [krkn-{VERSION}-src.tar.gz](https://krkn-chaos.gateway.scarf.sh/krkn-src-{VERSION}.tar.gz) 5 | 6 | ### Changes 7 | {CHANGES} 8 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/.github/release.yml -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | on: 3 | push: 4 | tags: ['v[0-9].[0-9]+.[0-9]+'] 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out code 12 | uses: actions/checkout@v3 13 | - name: Build the Docker images 14 | if: startsWith(github.ref, 'refs/tags') 15 | run: | 16 | ./containers/compile_dockerfile.sh 17 | docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg TAG=${GITHUB_REF#refs/tags/} 18 | docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn 19 | docker tag quay.io/krkn-chaos/krkn quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/} 20 | docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn:${GITHUB_REF#refs/tags/} 21 | 22 | - name: Test Build the Docker images 23 | if: ${{ github.event_name == 'pull_request' }} 24 | run: | 25 | ./containers/compile_dockerfile.sh 26 | docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg PR_NUMBER=${{ github.event.pull_request.number }} 27 | - name: Login in quay 28 | if: startsWith(github.ref, 'refs/tags') 29 | run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN} 30 | env: 31 | QUAY_USER: ${{ secrets.QUAY_USERNAME }} 32 | QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }} 33 | - name: Push the KrknChaos Docker images 34 | if: startsWith(github.ref, 'refs/tags') 35 | run: | 36 | docker push quay.io/krkn-chaos/krkn 37 | docker push quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/} 38 | - name: Login in to redhat-chaos quay 39 | if: startsWith(github.ref, 'refs/tags/v') 40 | run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN} 41 | env: 42 | QUAY_USER: ${{ secrets.QUAY_USER_1 }} 43 | QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }} 44 | - name: Push the RedHat Chaos Docker images 45 | if: startsWith(github.ref, 'refs/tags') 46 | run: | 47 | docker push quay.io/redhat-chaos/krkn 48 | docker push quay.io/redhat-chaos/krkn:${GITHUB_REF#refs/tags/} 49 | - name: Rebuild krkn-hub 50 | if: startsWith(github.ref, 'refs/tags') 51 | uses: redhat-chaos/actions/krkn-hub@main 52 | with: 53 | QUAY_USER: ${{ secrets.QUAY_USERNAME }} 54 | QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }} 55 | AUTOPUSH: ${{ secrets.AUTOPUSH }} 56 | 
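The tag-triggered build in the workflow above can be reproduced locally when debugging image problems. The following is a minimal sketch and not a file from this repository: it assumes a local Docker daemon, a checkout of the tag to build, and that `containers/compile_dockerfile.sh` is run from the repository root exactly as the workflow does; the tag value is a hypothetical example.

```bash
#!/usr/bin/env bash
# Sketch: locally reproduce the tagged image build performed by docker-image.yml.
set -euo pipefail

TAG="v1.2.3"   # hypothetical tag; in CI the workflow derives it from ${GITHUB_REF#refs/tags/}
git checkout "tags/${TAG}"

# Render the Dockerfile from its template, as the workflow does before building.
./containers/compile_dockerfile.sh

# Build and tag the image the same way the "Build the Docker images" step does.
docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg TAG="${TAG}"
docker tag quay.io/krkn-chaos/krkn "quay.io/krkn-chaos/krkn:${TAG}"
```

Pushing to quay.io is deliberately omitted here; in CI those steps run only for tag refs and use the registry credentials stored as repository secrets.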
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Create Release 2 | on: 3 | push: 4 | tags: 5 | - 'v*' 6 | jobs: 7 | release: 8 | permissions: 9 | contents: write 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: calculate previous tag 14 | run: | 15 | git fetch --tags origin 16 | PREVIOUS_TAG=$(git tag --sort=-creatordate | sed -n '2 p') 17 | echo $PREVIOUS_TAG 18 | echo "PREVIOUS_TAG=$PREVIOUS_TAG" >> "$GITHUB_ENV" 19 | - name: generate release notes from template 20 | id: release-notes 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | run: | 24 | NOTES=$(gh api \ 25 | --method POST \ 26 | -H "Accept: application/vnd.github+json" \ 27 | -H "X-GitHub-Api-Version: 2022-11-28" \ 28 | /repos/krkn-chaos/krkn/releases/generate-notes \ 29 | -f "tag_name=${{ github.ref_name }}" -f "target_commitish=main" -f "previous_tag_name=${{ env.PREVIOUS_TAG }}" | jq -r .body) 30 | echo "NOTES<<EOF" >> $GITHUB_ENV 31 | echo "$NOTES" >> $GITHUB_ENV 32 | echo "EOF" >> $GITHUB_ENV 33 | 34 | - name: replace placeholders in template 35 | run: | 36 | echo "${{ env.NOTES }}" 37 | TEMPLATE=$(cat .github/release-template.md) 38 | VERSION=${{ github.ref_name }} 39 | NOTES="${{ env.NOTES }}" 40 | OUTPUT=${TEMPLATE//\{VERSION\}/$VERSION} 41 | OUTPUT=${OUTPUT//\{CHANGES\}/$NOTES} 42 | echo "$OUTPUT" > release-notes.md 43 | - name: create release 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | run: | 47 | gh release create ${{ github.ref_name }} --title "${{ github.ref_name }}" -F release-notes.md 48 | -------------------------------------------------------------------------------- /.github/workflows/require-docs.yml: -------------------------------------------------------------------------------- 1 | name: Require Documentation Update 2 | on: 3 | pull_request: 4 | types: [opened, edited, synchronize] 5 | branches: 6 | - main 7 | jobs: 8 | check-docs: 9 | name: Check Documentation Update 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v4 14 | 15 | - name: Check if Documentation is Required 16 | id: check_docs 17 | run: | 18 | echo "Checking PR body for documentation checkbox..." 19 | # Read the PR body from the GitHub event payload 20 | if echo "${{ github.event.pull_request.body }}" | grep -qi '\[x\].*documentation needed'; then 21 | echo "Documentation required detected." 22 | echo "docs_required=true" >> $GITHUB_OUTPUT 23 | else 24 | echo "Documentation not required." 25 | echo "docs_required=false" >> $GITHUB_OUTPUT 26 | fi 27 | 28 | - name: Enforce Documentation Update (if required) 29 | if: steps.check_docs.outputs.docs_required == 'true' 30 | env: 31 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 32 | run: | 33 | # Retrieve feature branch and repository owner from the GitHub context 34 | FEATURE_BRANCH="${{ github.head_ref }}" 35 | REPO_OWNER="${{ github.repository_owner }}" 36 | WEBSITE_REPO="website" 37 | echo "Searching for a merged documentation PR for feature branch: $FEATURE_BRANCH in $REPO_OWNER/$WEBSITE_REPO..."
38 | MERGED_PR=$(gh pr list --repo "$REPO_OWNER/$WEBSITE_REPO" --state merged --json headRefName,title,url | jq -r \ 39 | --arg FEATURE_BRANCH "$FEATURE_BRANCH" '.[] | select(.title | contains($FEATURE_BRANCH)) | .url') 40 | if [[ -z "$MERGED_PR" ]]; then 41 | echo ":x: Documentation PR for branch '$FEATURE_BRANCH' is required and has not been merged." 42 | exit 1 43 | else 44 | echo ":white_check_mark: Found merged documentation PR: $MERGED_PR" 45 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | .DS_Store 16 | *.out 17 | kube-burner* 18 | kube_burner* 19 | recommender_*.json 20 | 21 | # Project files 22 | .ropeproject 23 | .project 24 | .pydevproject 25 | .settings 26 | .idea 27 | .vscode 28 | config/debug.yaml 29 | tags 30 | 31 | # Package files 32 | *.egg 33 | *.eggs/ 34 | .installed.cfg 35 | *.egg-info 36 | 37 | # Unittest and coverage 38 | htmlcov/* 39 | .coverage 40 | junit.xml 41 | coverage.xml 42 | .pytest_cache/ 43 | 44 | # Build and docs folder/files 45 | build/* 46 | dist/* 47 | sdist/* 48 | docs/api/* 49 | docs/_rst/* 50 | docs/_build/* 51 | cover/* 52 | MANIFEST 53 | 54 | # Per-project virtualenvs 55 | .venv*/ 56 | venv*/ 57 | kraken.report 58 | collected-metrics/* 59 | inspect.local.* 60 | 61 | # Tests 62 | !CI/config/common_test_config.yaml 63 | CI/out/* 64 | CI/ci_results 65 | CI/legacy/*node.yaml 66 | CI/results.markdown 67 | 68 | #env 69 | chaos/* 70 | 71 | -------------------------------------------------------------------------------- /.gitleaks.toml: -------------------------------------------------------------------------------- 1 | [allowlist] 2 | description = "Global Allowlist" 3 | 4 | paths = [ 5 | '''kraken/arcaflow_plugin/fixtures/*''' 6 | ] 7 | -------------------------------------------------------------------------------- /ADOPTERS.md: -------------------------------------------------------------------------------- 1 | # Krkn Adopters 2 | 3 | This is a list of organizations that have publicly acknowledged usage of Krkn and shared details of how they are leveraging it in their environment for chaos engineering use cases. Do you want to add yourself to this list? Please fork the repository and open a PR with the required change. 4 | 5 | | Organization | Since | Website | Use-Case | 6 | |:-|:-|:-|:-| 7 | | MarketAxess | 2024 | https://www.marketaxess.com/ | Kraken enables us to achieve our goal of increasing the reliability of our cloud products on Kubernetes. The tool allows us to automatically run various chaos scenarios, identify resilience and performance bottlenecks, and seamlessly restore the system to its original state once scenarios finish. These chaos scenarios include pod disruptions, node (EC2) outages, simulating availability zone (AZ) outages, and filling up storage spaces like EBS and EFS. The community is highly responsive to requests and works on expanding the tool's capabilities. MarketAxess actively contributes to the project, adding features such as the ability to leverage existing network ACLs and proposing several feature improvements to enhance test coverage. 
| 8 | | Red Hat Openshift | 2020 | https://www.redhat.com/ | Kraken is a highly reliable chaos testing tool used to ensure the quality and resiliency of Red Hat Openshift. The engineering team runs all the test scenarios under Kraken on different cloud platforms on both self-managed and cloud services environments prior to the release of a new version of the product. The team also contributes to the Kraken project consistently which helps the test scenarios to keep up with the new features introduced to the product. Inclusion of this test coverage has contributed to gaining the trust of new and existing customers of the product. | 9 | -------------------------------------------------------------------------------- /CI/README.md: -------------------------------------------------------------------------------- 1 | ## CI Tests 2 | 3 | ### First steps 4 | Edit [functional_tests](tests/functional_tests) with tests you want to run 5 | 6 | ### How to run 7 | ```./CI/run.sh``` 8 | 9 | This will run kraken using python, make sure python3 is set up and configured properly with all requirements 10 | 11 | 12 | ### Adding a test case 13 | 14 | 1. Add in simple scenario yaml file to execute under [../CI/scenarios/](legacy) 15 | 16 | 2. Copy [test_application_outages.sh](tests/test_app_outages.sh) for example on how to get started 17 | 18 | 3. Lines to change for bash script 19 | 20 | a. 11: Set scenario type to be your new scenario name 21 | 22 | b. 12: Add pointer to scenario file for the test 23 | 24 | c. 13: If a post action file is needed; add in pointer 25 | 26 | d. 14: Set filled in config yaml file name specific to your scenario 27 | 28 | e. 15: Make sure name of config in line 14 matches what you pass on this line 29 | 30 | 4. Add test name to [functional_tests](../CI/tests/functional_tests) file 31 | 32 | a. This will be the name of the file without ".sh" 33 | 34 | 5. 
If any changes to the main config (other than the scenario list), please be sure to add them into the [common_config](config/common_test_config.yaml) 35 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/cluster_shut_down_scenario.yml: -------------------------------------------------------------------------------- 1 | cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes 2 | runs: 1 # Number of times to execute the cluster_shut_down scenario 3 | shut_down_duration: 10 # duration in seconds to shut down the cluster 4 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 5 | timeout: 60 # Number of seconds to wait for each node to be stopped or running 6 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/node_scenario.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # node chaos scenarios to be injected 3 | - node_reboot_scenario 4 | node_name: # node on which scenario has to be injected 5 | label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection 6 | instance_kill_count: 1 # number of times to inject each scenario under actions 7 | timeout: 80 # duration to wait for completion of node scenario injection 8 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 9 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/volume_scenario.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | kubernetes.io/metadata.name: kraken 6 | pod-security.kubernetes.io/audit: privileged 7 | pod-security.kubernetes.io/enforce: privileged 8 | pod-security.kubernetes.io/enforce-version: v1.24 9 | pod-security.kubernetes.io/warn: privileged 10 | security.openshift.io/scc.podSecurityLabelSync: "false" 11 | name: kraken 12 | --- 13 | apiVersion: v1 14 | kind: PersistentVolume 15 | metadata: 16 | name: kraken-test-pv 17 | namespace: kraken 18 | labels: 19 | type: local 20 | spec: 21 | storageClassName: manual 22 | capacity: 23 | storage: 2Gi 24 | accessModes: 25 | - ReadWriteOnce 26 | hostPath: 27 | path: "/mnt/data" 28 | --- 29 | apiVersion: v1 30 | kind: PersistentVolumeClaim 31 | metadata: 32 | name: kraken-test-pvc 33 | namespace: kraken 34 | spec: 35 | storageClassName: manual 36 | accessModes: 37 | - ReadWriteOnce 38 | resources: 39 | requests: 40 | storage: 1Gi 41 | --- 42 | apiVersion: v1 43 | kind: Pod 44 | metadata: 45 | name: kraken-test-pod 46 | namespace: kraken 47 | spec: 48 | volumes: 49 | - name: kraken-test-pv 50 | persistentVolumeClaim: 51 | claimName: kraken-test-pvc 52 | containers: 53 | - name: kraken-test-container 54 | image: 'quay.io/centos7/httpd-24-centos7:latest' 55 | volumeMounts: 56 | - mountPath: "/home/krake-dir/" 57 | name: kraken-test-pv 58 | securityContext: 59 | privileged: true 60 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/zone_outage.yaml: -------------------------------------------------------------------------------- 1 | zone_outage: # Scenario to create an outage of a zone by tweaking network ACL 2 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs. 
aws is only platform supported currently for this scenario. 3 | duration: 10 # duration in seconds after which the zone will be back online 4 | vpc_id: $VPC_ID # cluster virtual private network to target 5 | subnet_id: $SUBNET_ID # List of subnet-id's to deny both ingress and egress traffic 6 | -------------------------------------------------------------------------------- /CI/legacy/scenarios/zone_outage_env.yaml: -------------------------------------------------------------------------------- 1 | zone_outage: # Scenario to create an outage of a zone by tweaking network ACL 2 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs. aws is only platform supported currently for this scenario. 3 | duration: 10 # duration in seconds after which the zone will be back online 4 | vpc_id: vpc-0b43122e2d2ee058f # cluster virtual private network to target 5 | subnet_id: [subnet-088c73e73587d8aba] # List of subnet-id's to deny both ingress and egress traffic 6 | -------------------------------------------------------------------------------- /CI/legacy/tests/test_nodes.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function funtional_test_node_crash { 10 | 11 | export scenario_type="node_scenarios" 12 | export scenario_file="CI/scenarios/node_scenario.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/node_config.yaml 15 | 16 | python3 -m coverage run -a run_kraken.py -c CI/config/node_config.yaml 17 | echo "Node scenario test: Success" 18 | } 19 | 20 | funtional_test_node_crash 21 | -------------------------------------------------------------------------------- /CI/legacy/tests/test_shut_down.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_shut_down { 10 | 11 | export scenario_type="cluster_shut_down_scenarios" 12 | export scenario_file="- CI/scenarios/cluster_shut_down_scenario.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/shut_down.yaml 15 | python3 -m coverage run -a run_kraken.py -c CI/config/shut_down.yaml 16 | echo "Cluster shut down scenario test: Success" 17 | } 18 | 19 | functional_test_shut_down 20 | -------------------------------------------------------------------------------- /CI/legacy/tests/test_zone.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_zone_crash { 10 | 11 | export scenario_type="zone_outages" 12 | export scenario_file="CI/scenarios/zone_outage_env.yaml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/zone3_config.yaml 15 | envsubst < CI/scenarios/zone_outage.yaml > CI/scenarios/zone_outage_env.yaml 16 | python3 -m coverage run -a run_kraken.py -c CI/config/zone3_config.yaml 17 | echo "zone3 scenario test: Success" 18 | } 19 | 20 | functional_test_zone_crash 21 | -------------------------------------------------------------------------------- /CI/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MAX_RETRIES=60 3 | 4 | KUBECTL=`which kubectl 2>/dev/null` 5 | [[ $? 
!= 0 ]] && echo "[ERROR]: kubectl missing, please install it and try again" && exit 1 6 | 7 | wait_cluster_become_ready() { 8 | COUNT=1 9 | until `$KUBECTL get namespace > /dev/null 2>&1` 10 | do 11 | echo "[INF] waiting Kubernetes to become ready, after $COUNT check" 12 | sleep 3 13 | [[ $COUNT == $MAX_RETRIES ]] && echo "[ERR] max retries exceeded, failing" && exit 1 14 | ((COUNT++)) 15 | done 16 | } 17 | 18 | 19 | 20 | ci_tests_loc="CI/tests/functional_tests" 21 | 22 | echo -e "********* Running Functional Tests Suite *********\n\n" 23 | 24 | rm -rf CI/out 25 | 26 | mkdir CI/out 27 | 28 | results_file_name="results.markdown" 29 | 30 | rm -f CI/$results_file_name 31 | 32 | results="CI/$results_file_name" 33 | 34 | # Prep the results.markdown file 35 | echo 'Test | Result | Duration' >> $results 36 | echo '-----------------------|--------|---------' >> $results 37 | 38 | # Run each test 39 | failed_tests=() 40 | for test_name in `cat CI/tests/functional_tests` 41 | do 42 | #wait_cluster_become_ready 43 | return_value=`./CI/run_test.sh $test_name $results` 44 | if [[ $return_value == 1 ]] 45 | then 46 | echo "Failed" 47 | failed_tests+=("$test_name") 48 | fi 49 | wait_cluster_become_ready 50 | done 51 | 52 | 53 | if (( ${#failed_tests[@]}>0 )) 54 | then 55 | echo -e "\n\n======================================================================" 56 | echo -e "\n FUNCTIONAL TESTS FAILED ${failed_tests[*]} ABORTING" 57 | echo -e "\n======================================================================\n\n" 58 | 59 | for test in "${failed_tests[@]}" 60 | do 61 | echo -e "\n********** $test KRKN RUN OUTPUT **********\n" 62 | cat "CI/out/$test.out" 63 | echo -e "\n********************************************\n\n\n\n" 64 | done 65 | 66 | exit 1 67 | fi 68 | -------------------------------------------------------------------------------- /CI/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | readonly SECONDS_PER_HOUR=3600 3 | readonly SECONDS_PER_MINUTE=60 4 | function get_time_format() { 5 | seconds=$1 6 | hours=$((${seconds} / ${SECONDS_PER_HOUR})) 7 | seconds=$((${seconds} % ${SECONDS_PER_HOUR})) 8 | minutes=$((${seconds} / ${SECONDS_PER_MINUTE})) 9 | seconds=$((${seconds} % ${SECONDS_PER_MINUTE})) 10 | echo $hours:$minutes:$seconds 11 | } 12 | ci_test=`echo $1` 13 | 14 | results_file=$2 15 | 16 | echo -e "test: ${ci_test}" >&2 17 | 18 | ci_results="CI/out/$ci_test.out" 19 | # Test ci 20 | 21 | echo "results $ci_results" >> $ci_results 22 | SECONDS=0 23 | if /bin/bash CI/tests/$ci_test.sh >> $ci_results 2>&1 24 | then 25 | # if the test passes update the results and complete 26 | duration=$SECONDS 27 | duration=$(get_time_format $duration) 28 | echo -e "> $ci_test: Successful\n" >&2 29 | echo "$ci_test | Pass | $duration" >> $results_file 30 | count=$retries 31 | # return value for run.sh 32 | echo 0 33 | else 34 | duration=$SECONDS 35 | duration=$(get_time_format $duration) 36 | echo -e "> $ci_test: Failed\n" >&2 37 | echo "$ci_test | Fail | $duration" >> $results_file 38 | # return value for run.sh 39 | echo 1 40 | fi 41 | -------------------------------------------------------------------------------- /CI/templates/container_scenario_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: container 5 | labels: 6 | scenario: container 7 | spec: 8 | hostNetwork: true 9 | containers: 10 | - name: fedtools 11 | image: docker.io/fedora/tools 12 | 
command: 13 | - /bin/sh 14 | - -c 15 | - | 16 | sleep infinity -------------------------------------------------------------------------------- /CI/templates/outage_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: outage 5 | labels: 6 | scenario: outage 7 | spec: 8 | hostNetwork: true 9 | containers: 10 | - name: fedtools 11 | image: docker.io/fedora/tools 12 | command: 13 | - /bin/sh 14 | - -c 15 | - | 16 | sleep infinity -------------------------------------------------------------------------------- /CI/templates/service_hijacking.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: nginx 5 | labels: 6 | app.kubernetes.io/name: proxy 7 | spec: 8 | containers: 9 | - name: nginx 10 | image: nginx:stable 11 | ports: 12 | - containerPort: 80 13 | name: http-web-svc 14 | 15 | --- 16 | apiVersion: v1 17 | kind: Service 18 | metadata: 19 | name: nginx-service 20 | spec: 21 | selector: 22 | app.kubernetes.io/name: proxy 23 | type: NodePort 24 | ports: 25 | - name: name-of-service-port 26 | protocol: TCP 27 | port: 80 28 | targetPort: http-web-svc 29 | nodePort: 30036 -------------------------------------------------------------------------------- /CI/templates/time_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: time-skew 5 | labels: 6 | scenario: time-skew 7 | spec: 8 | hostNetwork: true 9 | containers: 10 | - name: fedtools 11 | image: docker.io/fedora/tools 12 | command: 13 | - /bin/sh 14 | - -c 15 | - | 16 | sleep infinity -------------------------------------------------------------------------------- /CI/tests/common.sh: -------------------------------------------------------------------------------- 1 | ERRORED=false 2 | 3 | function finish { 4 | if [ $? != 0 ] && [ $ERRORED != "true" ] 5 | then 6 | error 7 | fi 8 | } 9 | 10 | function error { 11 | exit_code=$? 12 | if [ $exit_code == 1 ] 13 | then 14 | echo "Error caught." 
15 | ERRORED=true 16 | elif [ $exit_code == 2 ] 17 | then 18 | echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure" 19 | exit 0 20 | fi 21 | } 22 | 23 | function get_node { 24 | worker_node=$(kubectl get nodes --no-headers | grep worker | head -n 1) 25 | export WORKER_NODE=$worker_node 26 | } 27 | -------------------------------------------------------------------------------- /CI/tests/functional_tests: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CI/tests/test_app_outages.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_app_outage { 10 | yq -i '.application_outage.duration=10' scenarios/openshift/app_outage.yaml 11 | yq -i '.application_outage.pod_selector={"scenario":"outage"}' scenarios/openshift/app_outage.yaml 12 | yq -i '.application_outage.namespace="default"' scenarios/openshift/app_outage.yaml 13 | export scenario_type="application_outages_scenarios" 14 | export scenario_file="scenarios/openshift/app_outage.yaml" 15 | export post_config="" 16 | envsubst < CI/config/common_test_config.yaml > CI/config/app_outage.yaml 17 | python3 -m coverage run -a run_kraken.py -c CI/config/app_outage.yaml 18 | echo "App outage scenario test: Success" 19 | } 20 | 21 | functional_test_app_outage 22 | -------------------------------------------------------------------------------- /CI/tests/test_container.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | pod_file="CI/scenarios/hello_pod.yaml" 9 | 10 | function functional_test_container_crash { 11 | yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml 12 | yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml 13 | yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml 14 | export scenario_type="container_scenarios" 15 | export scenario_file="scenarios/openshift/container_etcd.yml" 16 | export post_config="" 17 | envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml 18 | 19 | python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml 20 | echo "Container scenario test: Success" 21 | } 22 | 23 | functional_test_container_crash 24 | -------------------------------------------------------------------------------- /CI/tests/test_cpu_hog.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_cpu_hog { 10 | yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/cpu-hog.yml 11 | 12 | export scenario_type="hog_scenarios" 13 | export scenario_file="scenarios/kube/cpu-hog.yml" 14 | export post_config="" 15 | envsubst < CI/config/common_test_config.yaml > CI/config/cpu_hog.yaml 16 | python3 -m coverage run -a run_kraken.py -c CI/config/cpu_hog.yaml 17 | echo "CPU Hog: Success" 18 | } 19 | 20 | functional_test_cpu_hog -------------------------------------------------------------------------------- /CI/tests/test_io_hog.sh: 
-------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_io_hog { 10 | yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/io-hog.yml 11 | export scenario_type="hog_scenarios" 12 | export scenario_file="scenarios/kube/io-hog.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/io_hog.yaml 15 | python3 -m coverage run -a run_kraken.py -c CI/config/io_hog.yaml 16 | echo "IO Hog: Success" 17 | } 18 | 19 | functional_test_io_hog -------------------------------------------------------------------------------- /CI/tests/test_memory_hog.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_memory_hog { 10 | yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/memory-hog.yml 11 | export scenario_type="hog_scenarios" 12 | export scenario_file="scenarios/kube/memory-hog.yml" 13 | export post_config="" 14 | envsubst < CI/config/common_test_config.yaml > CI/config/memory_hog.yaml 15 | python3 -m coverage run -a run_kraken.py -c CI/config/memory_hog.yaml 16 | echo "Memory Hog: Success" 17 | } 18 | 19 | functional_test_memory_hog -------------------------------------------------------------------------------- /CI/tests/test_namespace.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | function funtional_test_namespace_deletion { 9 | export scenario_type="service_disruption_scenarios" 10 | export scenario_file="scenarios/openshift/ingress_namespace.yaml" 11 | export post_config="" 12 | yq '.scenarios[0].namespace="^namespace-scenario$"' -i scenarios/openshift/ingress_namespace.yaml 13 | yq '.scenarios[0].wait_time=30' -i scenarios/openshift/ingress_namespace.yaml 14 | yq '.scenarios[0].action="delete"' -i scenarios/openshift/ingress_namespace.yaml 15 | envsubst < CI/config/common_test_config.yaml > CI/config/namespace_config.yaml 16 | python3 -m coverage run -a run_kraken.py -c CI/config/namespace_config.yaml 17 | echo "Namespace scenario test: Success" 18 | } 19 | 20 | funtional_test_namespace_deletion 21 | -------------------------------------------------------------------------------- /CI/tests/test_net_chaos.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_network_chaos { 10 | yq -i '.network_chaos.duration=10' scenarios/openshift/network_chaos.yaml 11 | yq -i '.network_chaos.node_name="kind-worker2"' scenarios/openshift/network_chaos.yaml 12 | yq -i '.network_chaos.egress.bandwidth="100mbit"' scenarios/openshift/network_chaos.yaml 13 | yq -i 'del(.network_chaos.interfaces)' scenarios/openshift/network_chaos.yaml 14 | yq -i 'del(.network_chaos.label_selector)' scenarios/openshift/network_chaos.yaml 15 | yq -i 'del(.network_chaos.egress.latency)' scenarios/openshift/network_chaos.yaml 16 | yq -i 'del(.network_chaos.egress.loss)' scenarios/openshift/network_chaos.yaml 17 | 18 | export scenario_type="network_chaos_scenarios" 19 | export 
scenario_file="scenarios/openshift/network_chaos.yaml" 20 | export post_config="" 21 | envsubst < CI/config/common_test_config.yaml > CI/config/network_chaos.yaml 22 | python3 -m coverage run -a run_kraken.py -c CI/config/network_chaos.yaml 23 | echo "Network Chaos test: Success" 24 | } 25 | 26 | functional_test_network_chaos 27 | -------------------------------------------------------------------------------- /CI/tests/test_telemetry.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_telemetry { 10 | AWS_CLI=`which aws` 11 | [ -z "$AWS_CLI" ]&& echo "AWS cli not found in path" && exit 1 12 | [ -z "$AWS_BUCKET" ] && echo "AWS bucket not set in environment" && exit 1 13 | 14 | export RUN_TAG="funtest-telemetry" 15 | yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml 16 | yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml 17 | yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml 18 | yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml 19 | yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml 20 | 21 | export scenario_type="hog_scenarios" 22 | 23 | export scenario_file="scenarios/kube/cpu-hog.yml" 24 | 25 | export post_config="" 26 | envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml 27 | retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml) 28 | RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"` 29 | $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files 30 | echo "checking if telemetry files are uploaded on s3" 31 | cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 ) 32 | cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 ) 33 | cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 ) 34 | echo "all files uploaded!" 
35 | echo "Telemetry Collection: Success" 36 | } 37 | 38 | functional_test_telemetry -------------------------------------------------------------------------------- /CI/tests/test_time.sh: -------------------------------------------------------------------------------- 1 | set -xeEo pipefail 2 | 3 | source CI/tests/common.sh 4 | 5 | trap error ERR 6 | trap finish EXIT 7 | 8 | 9 | function functional_test_time_scenario { 10 | yq -i '.time_scenarios[0].label_selector="scenario=time-skew"' scenarios/openshift/time_scenarios_example.yml 11 | yq -i '.time_scenarios[0].container_name=""' scenarios/openshift/time_scenarios_example.yml 12 | yq -i '.time_scenarios[0].namespace="default"' scenarios/openshift/time_scenarios_example.yml 13 | yq -i '.time_scenarios[1].label_selector="kubernetes.io/hostname=kind-worker2"' scenarios/openshift/time_scenarios_example.yml 14 | export scenario_type="time_scenarios" 15 | export scenario_file="scenarios/openshift/time_scenarios_example.yml" 16 | export post_config="" 17 | envsubst < CI/config/common_test_config.yaml > CI/config/time_config.yaml 18 | 19 | python3 -m coverage run -a run_kraken.py -c CI/config/time_config.yaml 20 | echo "Time scenario test: Success" 21 | } 22 | 23 | functional_test_time_scenario 24 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This document contains a list of maintainers in this repo. 4 | 5 | ## Current Maintainers 6 | 7 | | Maintainer | GitHub ID | Email | 8 | |---------------------| --------------------------------------------------------- | ----------------------- | 9 | | Ravi Elluri | [chaitanyaenr](https://github.com/chaitanyaenr) | nelluri@redhat.com | 10 | | Pradeep Surisetty | [psuriset](https://github.com/psuriset) | psuriset@redhat.com | 11 | | Paige Rubendall | [paigerube14](https://github.com/paigerube14) | prubenda@redhat.com | 12 | | Tullio Sebastiani | [tsebastiani](https://github.com/tsebastiani) | tsebasti@redhat.com | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Krkn aka Kraken 2 | ![Workflow-Status](https://github.com/krkn-chaos/krkn/actions/workflows/docker-image.yml/badge.svg) 3 | ![coverage](https://krkn-chaos.github.io/krkn-lib-docs/coverage_badge_krkn.svg) 4 | ![action](https://github.com/krkn-chaos/krkn/actions/workflows/tests.yml/badge.svg) 5 | [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10548/badge)](https://www.bestpractices.dev/projects/10548) 6 | 7 | ![Krkn logo](media/logo.png) 8 | 9 | Chaos and resiliency testing tool for Kubernetes. 10 | Kraken injects deliberate failures into Kubernetes clusters to check if it is resilient to turbulent conditions. 11 | 12 | 13 | ### Workflow 14 | ![Kraken workflow](media/kraken-workflow.png) 15 | 16 | 17 | 19 | 20 | 21 | ### How to Get Started 22 | Instructions on how to setup, configure and run Kraken can be found in the [documentation](https://krkn-chaos.dev/docs/). 
23 | 24 | 25 | ### Blogs and other useful resources 26 | - Blog post on introduction to Kraken: https://www.openshift.com/blog/introduction-to-kraken-a-chaos-tool-for-openshift/kubernetes 27 | - Discussion and demo on how Kraken can be leveraged to ensure OpenShift is reliable, performant and scalable: https://www.youtube.com/watch?v=s1PvupI5sD0&ab_channel=OpenShift 28 | - Blog post emphasizing the importance of making Chaos part of Performance and Scale runs to mimic the production environments: https://www.openshift.com/blog/making-chaos-part-of-kubernetes/openshift-performance-and-scalability-tests 29 | - Blog post on findings from Chaos test runs: https://cloud.redhat.com/blog/openshift/kubernetes-chaos-stories 30 | - Discussion with CNCF TAG App Delivery on Krkn workflow, features and addition to CNCF sandbox: [Github](https://github.com/cncf/sandbox/issues/44), [Tracker](https://github.com/cncf/tag-app-delivery/issues/465), [recording](https://www.youtube.com/watch?v=nXQkBFK_MWc&t=722s) 31 | - Blog post on supercharging chaos testing using AI integration in Krkn: https://www.redhat.com/en/blog/supercharging-chaos-testing-using-ai 32 | - Blog post announcing Krkn joining CNCF Sandbox: https://www.redhat.com/en/blog/krknchaos-joining-cncf-sandbox 33 | 34 | 35 | ### Roadmap 36 | Enhancements being planned can be found in the [roadmap](ROADMAP.md). 37 | 38 | 39 | ### Contributions 40 | We are always looking for more enhancements, fixes to make it better, any contributions are most welcome. Feel free to report or work on the issues filed on github. 41 | 42 | [More information on how to Contribute](https://krkn-chaos.dev/docs/contribution-guidelines/) 43 | 44 | 45 | ### Community 46 | Key Members(slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, tsebasti/Tullio Sebastiani, yogi/Yogananth Subramanian, sahil/Sahil Shah, pradeep/Pradeep Surisetty and ravielluri/Naga Ravi Chaitanya Elluri. 47 | * [**#krkn on Kubernetes Slack**](https://kubernetes.slack.com/messages/C05SFMHRWK1) 48 | 49 | The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/legal/trademark-usage). 50 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | ## Krkn Roadmap 2 | 3 | Following are a list of enhancements that we are planning to work on adding support in Krkn. Of course any help/contributions are greatly appreciated. 
4 | 5 | - [ ] [Ability to run multiple chaos scenarios in parallel under load to mimic real world outages](https://github.com/krkn-chaos/krkn/issues/424) 6 | - [x] [Centralized storage for chaos experiments artifacts](https://github.com/krkn-chaos/krkn/issues/423) 7 | - [ ] [Support for causing DNS outages](https://github.com/krkn-chaos/krkn/issues/394) 8 | - [x] [Chaos recommender](https://github.com/krkn-chaos/krkn/tree/main/utils/chaos-recommender) to suggest scenarios having probability of impacting the service under test using profiling results 9 | - [ ] Chaos AI integration to improve test coverage while reducing fault space to save costs and execution time 10 | - [x] [Support for pod level network traffic shaping](https://github.com/krkn-chaos/krkn/issues/393) 11 | - [ ] [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/krkn-chaos/krkn/issues/124) 12 | - [x] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/krkn-chaos/krkn/issues/185, https://github.com/redhat-chaos/krkn/issues/186 13 | - [x] Continue to improve [Chaos Testing Guide](https://krkn-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well as the applications running on top of it, are resilient and performant under chaotic conditions. 14 | - [x] [Switch documentation references to Kubernetes](https://github.com/krkn-chaos/krkn/issues/495) 15 | - [x] [OCP and Kubernetes functionalities segregation](https://github.com/krkn-chaos/krkn/issues/497) 16 | - [x] [Krknctl - client for running Krkn scenarios with ease](https://github.com/krkn-chaos/krknctl) 17 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | We attach great importance to code security. We are very grateful to the users, security vulnerability researchers, etc. for reporting security vulnerabilities to the Krkn community. All reported security vulnerabilities will be carefully assessed and addressed in a timely manner. 4 | 5 | 6 | ## Security Checks 7 | 8 | Krkn leverages [Snyk](https://snyk.io/) to ensure that any security vulnerabilities found 9 | in the code base and dependencies are fixed and published in the latest release. Security 10 | vulnerability checks are enabled for each pull request to enable developers to get insights 11 | and proactively fix them. 12 | 13 | 14 | ## Reporting a Vulnerability 15 | 16 | The Krkn project treats security vulnerabilities seriously, so we 17 | strive to take action quickly when required. 18 | 19 | The project requests that security issues be disclosed in a responsible 20 | manner to allow adequate time to respond. If a security issue or 21 | vulnerability has been found, please disclose the details to our 22 | dedicated email address: 23 | 24 | cncf-krkn-maintainers@lists.cncf.io 25 | 26 | You can also use the [GitHub vulnerability report mechanism](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability#privately-reporting-a-security-vulnerability) to report the security vulnerability. 27 | 28 | Please include as much information as possible with the report.
The 29 | following details assist with analysis efforts: 30 | - Description of the vulnerability 31 | - Affected component (version, commit, branch etc) 32 | - Affected code (file path, line numbers) 33 | - Exploit code 34 | 35 | 36 | ## Security Team 37 | 38 | The security team currently consists of the [Maintainers of Krkn](https://github.com/krkn-chaos/krkn/blob/main/MAINTAINERS.md) 39 | 40 | 41 | ## Process and Supported Releases 42 | 43 | The Krkn security team will investigate and provide a fix in a timely manner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes. 44 | -------------------------------------------------------------------------------- /ansible/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | callback_whitelist = profile_tasks 3 | host_key_checking = False 4 | log_path = ~/ansible.log 5 | retry_files_enabled = False 6 | # work around privilege escalation timeouts in ansible: 7 | timeout = 30 8 | 9 | [callback_profile_tasks] 10 | task_output_limit = 10000 11 | sort_order = none 12 | -------------------------------------------------------------------------------- /ansible/inventory: -------------------------------------------------------------------------------- 1 | [orchestration] 2 | -------------------------------------------------------------------------------- /ansible/kraken.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: orchestration 3 | gather_facts: true 4 | remote_user: "{{ orchestration_user }}" 5 | vars_files: 6 | - vars/kraken_vars.yml 7 | 8 | tasks: 9 | - name: Git clone kraken repository 10 | git: 11 | repo: "{{ kraken_repository }}" 12 | dest: "{{ kraken_dir }}" 13 | force: yes 14 | 15 | - name: Generate kraken config file 16 | template: 17 | src: kraken.j2 18 | dest: "{{ kraken_config }}" 19 | 20 | - name: Start injecting failures 21 | shell: | 22 | cd "{{ kraken_dir }}" 23 | cp -r "{{ scenarios_folder_path }}"* scenarios/ 24 | unset CONFIG 25 | python3 run_kraken.py 26 | ignore_errors: yes 27 | -------------------------------------------------------------------------------- /ansible/templates/kraken.j2: -------------------------------------------------------------------------------- 1 | kraken: 2 | kubeconfig_path: {{ kubeconfig_path }} # Path to kubeconfig 3 | exit_on_failure: {{ exit_on_failure }} # Exit when a post action scenario fails 4 | scenarios: {{ scenarios }} # List of policies/chaos scenarios to load 5 | 6 | cerberus: 7 | cerberus_enabled: {{ cerberus_enabled }} # Enable it when cerberus is previously installed 8 | cerberus_url: {{ cerberus_url }} # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal 9 | 10 | tunings: 11 | wait_duration: {{ wait_duration }} # Duration to wait between each chaos scenario 12 | iterations: {{ iterations }} # Number of times to execute the scenarios 13 | daemon_mode: {{ daemon_mode }} # Iterations are set to infinity which means that the cerberus will monitor the resources forever 14 | -------------------------------------------------------------------------------- /ansible/vars/kraken_vars.yml: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Ansible SSH variables.
3 | ############################################################################### 4 | ansible_public_key_file: "{{ lookup('env', 'PUBLIC_KEY')|default('~/.ssh/id_rsa.pub', true) }}" 5 | ansible_private_key_file: "{{ lookup('env', 'PRIVATE_KEY')|default('~/.ssh/id_rsa', true) }}" 6 | 7 | orchestration_user: "{{ lookup('env', 'ORCHESTRATION_USER')|default('root', true) }}" 8 | ############################################################################### 9 | 10 | # kube config location 11 | kubeconfig_path: "{{ lookup('env', 'KUBECONFIG_PATH')|default('~/.kube/config', true) }}" 12 | 13 | # kraken dir location on jump host 14 | kraken_dir: "{{ lookup('env', 'KRAKEN_DIR')|default('~/kraken', true) }}" 15 | 16 | # kraken config path location 17 | kraken_config: "{{ lookup('env', 'KRAKEN_CONFIG')|default('~/kraken/config/config.yaml', true) }}" 18 | 19 | # kraken repository location 20 | kraken_repository: "{{ lookup('env', 'KRAKEN_REPOSITORY')|default('https://github.com/openshift-scale/kraken.git', true) }}" 21 | 22 | # scenarios to inject 23 | scenarios_folder_path: "{{ lookup('env', 'SCENARIOS_FOLDER_PATH')|default('CI/scenarios/', true) }}" 24 | scenarios: "{{ lookup('env', 'SCENARIOS')|default('[[scenarios/etcd.yml, scenarios/post_action_etcd_example.sh], [scenarios/openshift-apiserver.yml, scenarios/post_action_openshift-kube-apiserver.yml], [scenarios/openshift-kube-apiserver.yml, scenarios/post_action_openshift-apiserver.yml], [scenarios/regex_openshift_pod_kill.yml, scenarios/post_action_regex.py]]', true) }}" 25 | 26 | exit_on_failure: "{{ lookup('env', 'EXIT_ON_FAILURE')|default(false, true) }}" 27 | 28 | # Cerberus enabled by user 29 | cerberus_enabled: "{{ lookup('env', 'CERBERUS_ENABLED')|default(false, true) }}" 30 | cerberus_url: "{{ lookup('env', 'CERBERUS_URL')|default('', true) }}" 31 | 32 | # Kraken configurations 33 | wait_duration: "{{ lookup('env', 'WAIT_DURATION')|default(60, true) }}" 34 | iterations: "{{ lookup('env', 'ITERATIONS')|default(1, true) }}" 35 | daemon_mode: "{{ lookup('env', 'DAEMON_MODE')|default(false, true) }}" 36 | -------------------------------------------------------------------------------- /config/cerberus.yaml: -------------------------------------------------------------------------------- 1 | cerberus: 2 | distribution: openshift # Distribution can be kubernetes or openshift 3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig 4 | port: 8080 # http server port where cerberus status is published 5 | watch_nodes: True # Set to True for the cerberus to monitor the cluster nodes 6 | watch_cluster_operators: True # Set to True for cerberus to monitor cluster operators 7 | watch_url_routes: # Route url's you want to monitor, this is a double array with the url and optional authorization parameter 8 | watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label. 
9 | enabled: True 10 | label: node-role.kubernetes.io/master 11 | watch_namespaces: # List of namespaces to be monitored 12 | - openshift-etcd 13 | - openshift-apiserver 14 | - openshift-kube-apiserver 15 | - openshift-monitoring 16 | - openshift-kube-controller-manager 17 | - openshift-machine-api 18 | - openshift-kube-scheduler 19 | - openshift-ingress 20 | - openshift-sdn # When enabled, it will check for the cluster sdn and monitor that namespace 21 | cerberus_publish_status: True # When enabled, cerberus starts a light weight http server and publishes the status 22 | inspect_components: False # Enable it only when OpenShift client is supported to run 23 | # When enabled, cerberus collects logs, events and metrics of failed components 24 | 25 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. 26 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 27 | # This enables Cerberus to query prometheus and alert on observing high Kube API Server latencies. 28 | 29 | slack_integration: False # When enabled, cerberus reports the failed iterations in the slack channel 30 | # The following env vars needs to be set: SLACK_API_TOKEN ( Bot User OAuth Access Token ) and SLACK_CHANNEL ( channel to send notifications in case of failures ) 31 | # When slack_integration is enabled, a watcher can be assigned for each day. The watcher of the day is tagged while reporting failures in the slack channel. Values are slack member ID's. 32 | watcher_slack_ID: # (NOTE: Defining the watcher id's is optional and when the watcher slack id's are not defined, the slack_team_alias tag is used if it is set else no tag is used while reporting failures in the slack channel.) 33 | Monday: 34 | Tuesday: 35 | Wednesday: 36 | Thursday: 37 | Friday: 38 | Saturday: 39 | Sunday: 40 | slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned 41 | 42 | custom_checks: # Relative paths of files conataining additional user defined checks 43 | 44 | tunings: 45 | timeout: 3 # Number of seconds before requests fail 46 | iterations: 5 # Iterations to loop before stopping the watch, it will be replaced with infinity when the daemon mode is enabled 47 | sleep_time: 5 # Sleep duration between each iteration 48 | kube_api_request_chunk_size: 250 # Large requests will be broken into the specified chunk size to reduce the load on API server and improve responsiveness. 
49 | daemon_mode: True # Iterations are set to infinity which means that the cerberus will monitor the resources forever 50 | cores_usage_percentage: 0.5 # Set the fraction of cores to be used for multiprocessing 51 | 52 | database: 53 | database_path: /tmp/cerberus.db # Path where cerberus database needs to be stored 54 | reuse_database: False # When enabled, the database is reused to store the failures 55 | -------------------------------------------------------------------------------- /config/config_kind.yaml: -------------------------------------------------------------------------------- 1 | kraken: 2 | distribution: kubernetes # Distribution can be kubernetes or openshift 3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig 4 | exit_on_failure: False # Exit when a post action scenario fails 5 | port: 8081 6 | publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081 7 | signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details 8 | signal_address: 0.0.0.0 # Signal listening address 9 | chaos_scenarios: # List of policies/chaos scenarios to load 10 | - plugin_scenarios: 11 | - scenarios/kind/scheduler.yml 12 | - node_scenarios: 13 | - scenarios/kind/node_scenarios_example.yml 14 | 15 | cerberus: 16 | cerberus_enabled: False # Enable it when cerberus is previously installed 17 | cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal 18 | check_applicaton_routes: False # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run 19 | 20 | performance_monitoring: 21 | deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift 22 | repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" 23 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. 24 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
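    # Illustrative (hypothetical) values for a plain Kubernetes cluster where Prometheus is reachable in-cluster;
    # both fields are left empty above because they are only auto-discovered on OpenShift:
    # prometheus_url: http://prometheus.monitoring.svc.cluster.local:9090
    # prometheus_bearer_token: <token of a service account permitted to query Prometheus>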
25 | uuid: # uuid for the run is generated by default if not set 26 | enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error 27 | alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries 28 | 29 | tunings: 30 | wait_duration: 60 # Duration to wait between each chaos scenario 31 | iterations: 1 # Number of times to execute the scenarios 32 | daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever 33 | -------------------------------------------------------------------------------- /config/config_kubernetes.yaml: -------------------------------------------------------------------------------- 1 | kraken: 2 | distribution: kubernetes # Distribution can be kubernetes or openshift 3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig 4 | exit_on_failure: False # Exit when a post action scenario fails 5 | port: 8081 6 | publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081 7 | signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details 8 | chaos_scenarios: # List of policies/chaos scenarios to load 9 | - container_scenarios: # List of chaos pod scenarios to load 10 | - scenarios/kube/container_dns.yml 11 | - plugin_scenarios: 12 | - scenarios/kube/scheduler.yml 13 | 14 | cerberus: 15 | cerberus_enabled: False # Enable it when cerberus is previously installed 16 | cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal 17 | check_applicaton_routes: False # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run 18 | 19 | performance_monitoring: 20 | deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift 21 | repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" 22 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. 23 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
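    # One possible way to obtain a bearer token on Kubernetes >= 1.24 (the service account name and
    # namespace below are hypothetical; adjust them to your monitoring stack):
    #   kubectl create token prometheus-k8s -n monitoring
    # and paste the output into prometheus_bearer_token above.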
24 | uuid: # uuid for the run is generated by default if not set 25 | enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error 26 | alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries 27 | check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos after soak time for the cluster to settle down 28 | tunings: 29 | wait_duration: 60 # Duration to wait between each chaos scenario 30 | iterations: 1 # Number of times to execute the scenarios 31 | daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever 32 | -------------------------------------------------------------------------------- /config/metrics.yaml: -------------------------------------------------------------------------------- 1 | metrics: 2 | # API server 3 | - query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0 4 | metricName: schedulingThroughput 5 | 6 | # Containers & pod metrics 7 | - query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node) 8 | metricName: podCPU 9 | 10 | - query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node) 11 | metricName: podMemory 12 | 13 | - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0 14 | metricName: containerDiskUsage 15 | 16 | # Kubelet & CRI-O metrics 17 | - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} 18 | metricName: kubeletCPU 19 | 20 | - query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"} 21 | metricName: kubeletMemory 22 | 23 | - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"} 24 | metricName: crioCPU 25 | 26 | - query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"} 27 | metricName: crioMemory 28 | 29 | # Node metrics 30 | - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0 31 | metricName: nodeCPU-Masters 32 | 33 | - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) 34 | metricName: nodeMemory-Masters 35 | 36 | - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0 37 | metricName: nodeCPU-Workers 38 | 39 | - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[2m:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) 40 | metricName: nodeMemory-Workers 41 | 42 | - 
query: avg(node_memory_MemAvailable_bytes) by (instance) 43 | metricName: nodeMemoryAvailable 44 | 45 | - query: avg(node_memory_Active_bytes) by (instance) 46 | metricName: nodeMemoryActive 47 | 48 | - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) 49 | metricName: maxMemory-Masters 50 | 51 | - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) 52 | metricName: nodeMemoryCached+nodeMemoryBuffers 53 | 54 | - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) 55 | metricName: rxNetworkBytes 56 | 57 | - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) 58 | metricName: txNetworkBytes 59 | 60 | - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) 61 | metricName: nodeDiskWrittenBytes 62 | 63 | - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) 64 | metricName: nodeDiskReadBytes 65 | 66 | - query: sum(rate(etcd_server_leader_changes_seen_total[2m])) 67 | metricName: etcdLeaderChangesRate 68 | 69 | # Etcd metrics 70 | - query: etcd_server_is_leader > 0 71 | metricName: etcdServerIsLeader 72 | 73 | - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m])) 74 | metricName: 99thEtcdDiskBackendCommitDurationSeconds 75 | 76 | - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m])) 77 | metricName: 99thEtcdDiskWalFsyncDurationSeconds 78 | 79 | - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) 80 | metricName: 99thEtcdRoundTripTimeSeconds 81 | 82 | - query: etcd_mvcc_db_total_size_in_bytes 83 | metricName: etcdDBPhysicalSizeBytes 84 | 85 | - query: etcd_mvcc_db_total_size_in_use_in_bytes 86 | metricName: etcdDBLogicalSizeBytes 87 | 88 | - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0 89 | metricName: etcdObjectCount 90 | 91 | - query: sum by (cluster_version)(etcd_cluster_version) 92 | metricName: etcdVersion 93 | instant: true -------------------------------------------------------------------------------- /config/recommender_config.yaml: -------------------------------------------------------------------------------- 1 | application: openshift-etcd 2 | namespaces: openshift-etcd 3 | labels: app=openshift-etcd 4 | kubeconfig: ~/.kube/config.yaml 5 | prometheus_endpoint: 6 | auth_token: 7 | scrape_duration: 10m 8 | chaos_library: "kraken" 9 | log_level: INFO 10 | json_output_file: False 11 | json_output_folder_path: 12 | 13 | # for output purpose only do not change if not needed 14 | chaos_tests: 15 | GENERIC: 16 | - pod_failure 17 | - container_failure 18 | - node_failure 19 | - zone_outage 20 | - time_skew 21 | - namespace_failure 22 | - power_outage 23 | CPU: 24 | - node_cpu_hog 25 | NETWORK: 26 | - application_outage 27 | - node_network_chaos 28 | - pod_network_chaos 29 | MEM: 30 | - node_memory_hog 31 | - pvc_disk_fill 32 | 33 | threshold: .7 34 | cpu_threshold: .5 35 | mem_threshold: .5 36 | -------------------------------------------------------------------------------- /containers/Dockerfile.template: -------------------------------------------------------------------------------- 1 | # oc build 2 | FROM golang:1.23.1 AS oc-build 3 | RUN apt-get update && apt-get install -y --no-install-recommends libkrb5-dev 4 | WORKDIR /tmp 5 | RUN git clone --branch release-4.18 
https://github.com/openshift/oc.git 6 | WORKDIR /tmp/oc 7 | RUN go mod edit -go 1.23.1 &&\ 8 | go get github.com/moby/buildkit@v0.12.5 &&\ 9 | go get github.com/containerd/containerd@v1.7.11&&\ 10 | go get github.com/docker/docker@v25.0.6&&\ 11 | go get github.com/opencontainers/runc@v1.1.14&&\ 12 | go get github.com/go-git/go-git/v5@v5.13.0&&\ 13 | go get golang.org/x/net@v0.36.0&&\ 14 | go get github.com/containerd/containerd@v1.7.27&&\ 15 | go get golang.org/x/oauth2@v0.27.0&&\ 16 | go get golang.org/x/crypto@v0.35.0&&\ 17 | go mod tidy && go mod vendor 18 | RUN make GO_REQUIRED_MIN_VERSION:= oc 19 | 20 | FROM fedora:40 21 | ARG PR_NUMBER 22 | ARG TAG 23 | RUN groupadd -g 1001 krkn && useradd -m -u 1001 -g krkn krkn 24 | RUN dnf update -y 25 | 26 | ENV KUBECONFIG /home/krkn/.kube/config 27 | 28 | 29 | # This overwrites any existing configuration in /etc/yum.repos.d/kubernetes.repo 30 | RUN dnf update && dnf install -y --setopt=install_weak_deps=False \ 31 | git python39 jq yq gettext wget which &&\ 32 | dnf clean all 33 | 34 | # copy oc client binary from oc-build image 35 | COPY --from=oc-build /tmp/oc/oc /usr/bin/oc 36 | 37 | # krkn build 38 | RUN git clone https://github.com/krkn-chaos/krkn.git /home/krkn/kraken && \ 39 | mkdir -p /home/krkn/.kube 40 | 41 | WORKDIR /home/krkn/kraken 42 | 43 | # default behaviour will be to build main 44 | # if it is a PR trigger the PR itself will be checked out 45 | RUN if [ -n "$PR_NUMBER" ]; then git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} && git checkout pr-${PR_NUMBER};fi 46 | # if it is a TAG trigger checkout the tag 47 | RUN if [ -n "$TAG" ]; then git checkout "$TAG";fi 48 | 49 | RUN python3.9 -m ensurepip --upgrade --default-pip 50 | RUN python3.9 -m pip install --upgrade pip setuptools==70.0.0 51 | RUN pip3.9 install -r requirements.txt 52 | RUN pip3.9 install jsonschema 53 | 54 | LABEL krknctl.title.global="Krkn Base Image" 55 | LABEL krknctl.description.global="This is the krkn base image." 56 | LABEL krknctl.input_fields.global='$KRKNCTL_INPUT' 57 | 58 | 59 | RUN chown -R krkn:krkn /home/krkn && chmod 755 /home/krkn 60 | USER krkn 61 | ENTRYPOINT ["python3.9", "run_kraken.py"] 62 | CMD ["--config=config/config.yaml"] 63 | -------------------------------------------------------------------------------- /containers/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Kraken image 3 | 4 | Container image gets automatically built by quay.io at [Kraken image](https://quay.io/redhat-chaos/krkn). 5 | 6 | 7 | ### Run containerized version 8 | 9 | Refer [instructions](https://krkn-chaos.dev/docs/installation/) for information on how to run the containerized version of kraken. 10 | 11 | 12 | ### Run Custom Kraken Image 13 | 14 | Refer to [instructions](https://github.com/redhat-chaos/krkn/blob/main/containers/build_own_image-README.md) for information on how to run a custom containerized version of kraken using podman. 15 | -------------------------------------------------------------------------------- /containers/build_own_image-README.md: -------------------------------------------------------------------------------- 1 | # Building your own Kraken image 2 | 3 | 1. Git clone the Kraken repository using `git clone https://github.com/redhat-chaos/krkn.git`. 4 | 2. Modify the python code and yaml files to address your needs. 5 | 3. Execute `podman build -t :latest .` in the containers directory within kraken to build an image from a Dockerfile. 6 | 4. 
Execute `podman run --detach --name :latest` to start a container based on your new image. 7 | 8 | # Building the Kraken image on IBM Power (ppc64le) 9 | 10 | 1. Git clone the Kraken repository using `git clone https://github.com/redhat-chaos/krkn.git` on an IBM Power Systems server. 11 | 2. Modify the python code and yaml files to address your needs. 12 | 3. Execute `podman build -t :latest -f Dockerfile-ppc64le` in the containers directory within kraken to build an image from the Dockerfile for Power. 13 | 4. Execute `podman run --detach --name :latest` to start a container based on your new image. 14 | -------------------------------------------------------------------------------- /containers/compile_dockerfile.sh: -------------------------------------------------------------------------------- 1 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 2 | cd "$SCRIPT_DIR" 3 | export KRKNCTL_INPUT=$(cat krknctl-input.json|tr -d "\n") 4 | 5 | envsubst '${KRKNCTL_INPUT}' < Dockerfile.template > Dockerfile -------------------------------------------------------------------------------- /kind-config.yml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | nodes: 4 | - role: control-plane 5 | extraPortMappings: 6 | - containerPort: 30036 7 | hostPort: 8888 8 | - role: control-plane 9 | - role: control-plane 10 | - role: worker 11 | - role: worker 12 | -------------------------------------------------------------------------------- /krkn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/__init__.py -------------------------------------------------------------------------------- /krkn/cerberus/__init__.py: -------------------------------------------------------------------------------- 1 | from .setup import * 2 | -------------------------------------------------------------------------------- /krkn/chaos_recommender/__init__.py: -------------------------------------------------------------------------------- 1 | from .analysis import * 2 | from .kraken_tests import * 3 | from .prometheus import * -------------------------------------------------------------------------------- /krkn/chaos_recommender/kraken_tests.py: -------------------------------------------------------------------------------- 1 | def get_entries_by_category(filename, category): 2 | # Read the file 3 | with open(filename, "r") as file: 4 | content = file.read() 5 | 6 | # Split the content into sections based on the square brackets 7 | sections = content.split("\n\n") 8 | 9 | # Define the categories 10 | valid_categories = ["CPU", "NETWORK", "MEM", "GENERIC"] 11 | 12 | # Validate the provided category 13 | if category not in valid_categories: 14 | return [] 15 | 16 | # Find the section corresponding to the specified category 17 | target_section = None 18 | for section in sections: 19 | if section.startswith(f"[{category}]"): 20 | target_section = section 21 | break 22 | 23 | # If the category section was not found, return an empty list 24 | if target_section is None: 25 | return [] 26 | 27 | # Extract the entries from the category section 28 | entries = [ 29 | entry.strip() 30 | for entry in target_section.split("\n") 31 | if entry and not entry.startswith("[") 32 | ] 33 | 34 | return entries 35 | -------------------------------------------------------------------------------- 
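A minimal usage sketch for get_entries_by_category above; the temporary file path and section contents are made up for illustration, while the real category list ships with the chaos recommender:

from krkn.chaos_recommender.kraken_tests import get_entries_by_category

# Write a tiny, hypothetical category file in the bracketed-section format the
# parser expects (sections are separated by a blank line).
with open("/tmp/example_kraken_tests.txt", "w") as f:
    f.write("[GENERIC]\npod_failure\nnode_failure\n\n[CPU]\nnode_cpu_hog\n")

print(get_entries_by_category("/tmp/example_kraken_tests.txt", "CPU"))   # ['node_cpu_hog']
print(get_entries_by_category("/tmp/example_kraken_tests.txt", "DISK"))  # [] -- not a valid category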
/krkn/invoke/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/invoke/__init__.py -------------------------------------------------------------------------------- /krkn/invoke/command.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import logging 3 | import sys 4 | 5 | 6 | # Invokes a given command and returns the stdout 7 | def invoke(command, timeout=None): 8 | output = "" 9 | try: 10 | output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout) 11 | except Exception as e: 12 | logging.error("Failed to run %s, error: %s" % (command, e)) 13 | sys.exit(1) 14 | return output 15 | 16 | 17 | # Invokes a given command and returns the stdout 18 | def invoke_no_exit(command, timeout=None): 19 | output = "" 20 | try: 21 | output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout) 22 | logging.info("output " + str(output)) 23 | except Exception as e: 24 | logging.error("Failed to run %s, error: %s" % (command, e)) 25 | return str(e) 26 | return output 27 | 28 | 29 | def run(command): 30 | try: 31 | subprocess.run(command, shell=True, universal_newlines=True, timeout=45) 32 | except Exception: 33 | pass 34 | -------------------------------------------------------------------------------- /krkn/performance_dashboards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/performance_dashboards/__init__.py -------------------------------------------------------------------------------- /krkn/performance_dashboards/setup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import logging 3 | import git 4 | import sys 5 | 6 | 7 | # Installs a mutable grafana on the Kubernetes/OpenShift cluster and loads the performance dashboards 8 | def setup(repo, distribution): 9 | if distribution == "kubernetes": 10 | command = "cd performance-dashboards/dittybopper && ./k8s-deploy.sh" 11 | elif distribution == "openshift": 12 | command = "cd performance-dashboards/dittybopper && ./deploy.sh" 13 | else: 14 | logging.error("Provided distribution: %s is not supported" % (distribution)) 15 | sys.exit(1) 16 | delete_repo = "rm -rf performance-dashboards || exit 0" 17 | logging.info( 18 | "Cloning, installing mutable grafana on the cluster and loading the dashboards" 19 | ) 20 | try: 21 | # delete repo to clone the latest copy if exists 22 | subprocess.run(delete_repo, shell=True, universal_newlines=True, timeout=45) 23 | # clone the repo 24 | git.Repo.clone_from(repo, "performance-dashboards") 25 | # deploy performance dashboards 26 | subprocess.run(command, shell=True, universal_newlines=True) 27 | except Exception as e: 28 | logging.error("Failed to install performance-dashboards, error: %s" % (e)) 29 | -------------------------------------------------------------------------------- /krkn/prometheus/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import * -------------------------------------------------------------------------------- /krkn/scenario_plugins/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/application_outage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/application_outage/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/application_outage/application_outage_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import yaml 4 | from krkn_lib.models.telemetry import ScenarioTelemetry 5 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 6 | from krkn_lib.utils import get_yaml_item_value, get_random_string 7 | from jinja2 import Template 8 | from krkn import cerberus 9 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 10 | 11 | 12 | class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin): 13 | def run( 14 | self, 15 | run_uuid: str, 16 | scenario: str, 17 | krkn_config: dict[str, any], 18 | lib_telemetry: KrknTelemetryOpenshift, 19 | scenario_telemetry: ScenarioTelemetry, 20 | ) -> int: 21 | wait_duration = krkn_config["tunings"]["wait_duration"] 22 | try: 23 | with open(scenario, "r") as f: 24 | app_outage_config_yaml = yaml.full_load(f) 25 | scenario_config = app_outage_config_yaml["application_outage"] 26 | pod_selector = get_yaml_item_value( 27 | scenario_config, "pod_selector", "{}" 28 | ) 29 | traffic_type = get_yaml_item_value( 30 | scenario_config, "block", "[Ingress, Egress]" 31 | ) 32 | namespace = get_yaml_item_value(scenario_config, "namespace", "") 33 | duration = get_yaml_item_value(scenario_config, "duration", 60) 34 | 35 | start_time = int(time.time()) 36 | policy_name = f"krkn-deny-{get_random_string(5)}" 37 | 38 | network_policy_template = ( 39 | """--- 40 | apiVersion: networking.k8s.io/v1 41 | kind: NetworkPolicy 42 | metadata: 43 | name: """ 44 | + policy_name 45 | + """ 46 | spec: 47 | podSelector: 48 | matchLabels: {{ pod_selector }} 49 | policyTypes: {{ traffic_type }} 50 | """ 51 | ) 52 | t = Template(network_policy_template) 53 | rendered_spec = t.render( 54 | pod_selector=pod_selector, traffic_type=traffic_type 55 | ) 56 | yaml_spec = yaml.safe_load(rendered_spec) 57 | # Block the traffic by creating network policy 58 | logging.info("Creating the network policy") 59 | 60 | lib_telemetry.get_lib_kubernetes().create_net_policy( 61 | yaml_spec, namespace 62 | ) 63 | 64 | # wait for the specified duration 65 | logging.info( 66 | "Waiting for the specified duration in the config: %s" % duration 67 | ) 68 | time.sleep(duration) 69 | 70 | # unblock the traffic by deleting the network policy 71 | logging.info("Deleting the network policy") 72 | lib_telemetry.get_lib_kubernetes().delete_net_policy( 73 | policy_name, namespace 74 | ) 75 | 76 | logging.info( 77 | "End of scenario. 
Waiting for the specified duration: %s" 78 | % wait_duration 79 | ) 80 | time.sleep(wait_duration) 81 | 82 | end_time = int(time.time()) 83 | cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) 84 | except Exception as e: 85 | logging.error( 86 | "ApplicationOutageScenarioPlugin exiting due to Exception %s" % e 87 | ) 88 | return 1 89 | else: 90 | return 0 91 | 92 | def get_scenario_types(self) -> list[str]: 93 | return ["application_outages_scenarios"] 94 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/container/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/container/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/hogs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/hogs/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/managed_cluster/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/managed_cluster/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/managed_cluster/common_functions.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | from krkn_lib.k8s import KrknKubernetes 4 | 5 | 6 | # krkn_lib 7 | # Pick a random managedcluster with specified label selector 8 | def get_managedcluster( 9 | managedcluster_name, label_selector, instance_kill_count, kubecli: KrknKubernetes 10 | ): 11 | 12 | if managedcluster_name in kubecli.list_killable_managedclusters(): 13 | return [managedcluster_name] 14 | elif managedcluster_name: 15 | logging.info( 16 | "managedcluster with provided managedcluster_name does not exist or the managedcluster might " 17 | "be in unavailable state." 
18 | ) 19 | managedclusters = kubecli.list_killable_managedclusters(label_selector) 20 | if not managedclusters: 21 | raise Exception( 22 | "Available managedclusters with the provided label selector do not exist" 23 | ) 24 | logging.info( 25 | "Available managedclusters with the label selector %s: %s" 26 | % (label_selector, managedclusters) 27 | ) 28 | number_of_managedclusters = len(managedclusters) 29 | if instance_kill_count == number_of_managedclusters: 30 | return managedclusters 31 | managedclusters_to_return = [] 32 | for i in range(instance_kill_count): 33 | managedcluster_to_add = managedclusters[ 34 | random.randint(0, len(managedclusters) - 1) 35 | ] 36 | managedclusters_to_return.append(managedcluster_to_add) 37 | managedclusters.remove(managedcluster_to_add) 38 | return managedclusters_to_return 39 | 40 | 41 | # Wait until the managedcluster status becomes Available 42 | # krkn_lib 43 | def wait_for_available_status(managedcluster, timeout, kubecli: KrknKubernetes): 44 | kubecli.watch_managedcluster_status(managedcluster, "True", timeout) 45 | 46 | 47 | # Wait until the managedcluster status becomes Not Available 48 | # krkn_lib 49 | def wait_for_unavailable_status(managedcluster, timeout, kubecli: KrknKubernetes): 50 | kubecli.watch_managedcluster_status(managedcluster, "Unknown", timeout) 51 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/native/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/native_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 2 | from krkn.scenario_plugins.native.plugins import PLUGINS 3 | from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool 4 | from krkn_lib.models.telemetry import ScenarioTelemetry 5 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 6 | from typing import Any 7 | import logging 8 | 9 | 10 | class NativeScenarioPlugin(AbstractScenarioPlugin): 11 | 12 | def run( 13 | self, 14 | run_uuid: str, 15 | scenario: str, 16 | krkn_config: dict[str, any], 17 | lib_telemetry: KrknTelemetryOpenshift, 18 | scenario_telemetry: ScenarioTelemetry, 19 | ) -> int: 20 | pool = PodsMonitorPool(lib_telemetry.get_lib_kubernetes()) 21 | kill_scenarios = [ 22 | kill_scenario 23 | for kill_scenario in PLUGINS.unserialize_scenario(scenario) 24 | if kill_scenario["id"] == "kill-pods" 25 | ] 26 | 27 | try: 28 | self.start_monitoring(pool, kill_scenarios) 29 | PLUGINS.run( 30 | scenario, 31 | lib_telemetry.get_lib_kubernetes().get_kubeconfig_path(), 32 | krkn_config, 33 | run_uuid, 34 | ) 35 | result = pool.join() 36 | scenario_telemetry.affected_pods = result 37 | if result.error: 38 | logging.error(f"NativeScenarioPlugin unrecovered pods: {result.error}") 39 | return 1 40 | 41 | except Exception as e: 42 | logging.error("NativeScenarioPlugin exiting due to Exception %s" % e) 43 | pool.cancel() 44 | return 1 45 | else: 46 | return 0 47 | 48 | def get_scenario_types(self) -> list[str]: 49 | return [ 50 | "pod_disruption_scenarios", 51 | "pod_network_scenarios", 52 | "ingress_node_scenarios" 53 | ] 54 | 55 | def start_monitoring(self, pool: PodsMonitorPool, scenarios: 
list[Any]): 56 | for kill_scenario in scenarios: 57 | recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"] 58 | if ( 59 | "namespace_pattern" in kill_scenario["config"] 60 | and "label_selector" in kill_scenario["config"] 61 | ): 62 | namespace_pattern = kill_scenario["config"]["namespace_pattern"] 63 | label_selector = kill_scenario["config"]["label_selector"] 64 | pool.select_and_monitor_by_namespace_pattern_and_label( 65 | namespace_pattern=namespace_pattern, 66 | label_selector=label_selector, 67 | max_timeout=recovery_time, 68 | ) 69 | logging.info( 70 | f"waiting {recovery_time} seconds for pod recovery, " 71 | f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}" 72 | ) 73 | 74 | elif ( 75 | "namespace_pattern" in kill_scenario["config"] 76 | and "name_pattern" in kill_scenario["config"] 77 | ): 78 | namespace_pattern = kill_scenario["config"]["namespace_pattern"] 79 | name_pattern = kill_scenario["config"]["name_pattern"] 80 | pool.select_and_monitor_by_name_pattern_and_namespace_pattern( 81 | pod_name_pattern=name_pattern, 82 | namespace_pattern=namespace_pattern, 83 | max_timeout=recovery_time, 84 | ) 85 | logging.info( 86 | f"waiting {recovery_time} seconds for pod recovery, " 87 | f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}" 88 | ) 89 | else: 90 | raise Exception( 91 | f"impossible to determine monitor parameters, check {kill_scenario} configuration" 92 | ) 93 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/network/job.j2: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: chaos-{{jobname}} 5 | spec: 6 | template: 7 | spec: 8 | nodeName: {{nodename}} 9 | hostNetwork: true 10 | containers: 11 | - name: networkchaos 12 | image: docker.io/fedora/tools 13 | command: ["/bin/sh", "-c", "{{cmd}}"] 14 | securityContext: 15 | privileged: true 16 | volumeMounts: 17 | - mountPath: /lib/modules 18 | name: lib-modules 19 | readOnly: true 20 | volumes: 21 | - name: lib-modules 22 | hostPath: 23 | path: /lib/modules 24 | restartPolicy: Never 25 | backoffLimit: 0 -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/network/pod_interface.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: fedtools 5 | spec: 6 | hostNetwork: true 7 | nodeName: {{nodename}} 8 | containers: 9 | - name: fedtools 10 | image: docker.io/fedora/tools 11 | command: 12 | - /bin/sh 13 | - -c 14 | - "trap : TERM INT; sleep infinity & wait" 15 | securityContext: 16 | privileged: true 17 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/network/pod_module.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: modtools 5 | spec: 6 | nodeName: {{nodename}} 7 | containers: 8 | - name: modtools 9 | image: docker.io/fedora/tools 10 | imagePullPolicy: IfNotPresent 11 | command: 12 | - /bin/sh 13 | - -c 14 | - "trap : TERM INT; sleep infinity & wait" 15 | tty: true 16 | stdin: true 17 | stdinOnce: true 18 | securityContext: 19 | privileged: true 20 | volumeMounts: 21 | - name: host 22 | mountPath: /host 23 | volumes: 24 | - name: host 25 | hostPath: 26 | path: / 27 | hostNetwork: true 28 | hostIPC: true 29 | hostPID: 
true 30 | restartPolicy: Never -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/pod_network_outage/job.j2: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: chaos-{{jobname}} 5 | spec: 6 | template: 7 | spec: 8 | nodeName: {{nodename}} 9 | hostNetwork: true 10 | containers: 11 | - name: networkchaos 12 | image: docker.io/fedora/tools 13 | command: ["chroot", "/host", "/bin/sh", "-c", "{{cmd}}"] 14 | securityContext: 15 | privileged: true 16 | volumeMounts: 17 | - name: host 18 | mountPath: /host 19 | volumes: 20 | - name: host 21 | hostPath: 22 | path: / 23 | 24 | restartPolicy: Never 25 | backoffLimit: 0 26 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/pod_network_outage/pod_module.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: modtools 5 | spec: 6 | nodeName: {{nodename}} 7 | containers: 8 | - name: modtools 9 | image: docker.io/fedora/tools 10 | imagePullPolicy: IfNotPresent 11 | command: 12 | - /bin/sh 13 | - -c 14 | - "trap : TERM INT; sleep infinity & wait" 15 | tty: true 16 | stdin: true 17 | stdinOnce: true 18 | securityContext: 19 | privileged: true 20 | volumeMounts: 21 | - name: host 22 | mountPath: /host 23 | volumes: 24 | - name: host 25 | hostPath: 26 | path: / 27 | hostNetwork: true 28 | hostIPC: true 29 | hostPID: true 30 | restartPolicy: Never -------------------------------------------------------------------------------- /krkn/scenario_plugins/native/run_python_plugin.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import subprocess 3 | import sys 4 | import typing 5 | 6 | from arcaflow_plugin_sdk import plugin 7 | 8 | 9 | @dataclasses.dataclass 10 | class RunPythonFileInput: 11 | filename: str 12 | 13 | 14 | @dataclasses.dataclass 15 | class RunPythonFileOutput: 16 | stdout: str 17 | stderr: str 18 | 19 | 20 | @dataclasses.dataclass 21 | class RunPythonFileError: 22 | exit_code: int 23 | stdout: str 24 | stderr: str 25 | 26 | 27 | @plugin.step( 28 | id="run_python", 29 | name="Run a Python script", 30 | description="Run a specified Python script", 31 | outputs={"success": RunPythonFileOutput, "error": RunPythonFileError} 32 | ) 33 | def run_python_file(params: RunPythonFileInput) -> typing.Tuple[ 34 | str, 35 | typing.Union[RunPythonFileOutput, RunPythonFileError] 36 | ]: 37 | run_results = subprocess.run( 38 | [sys.executable, params.filename], 39 | capture_output=True 40 | ) 41 | if run_results.returncode == 0: 42 | return "success", RunPythonFileOutput( 43 | str(run_results.stdout, 'utf-8'), 44 | str(run_results.stderr, 'utf-8') 45 | ) 46 | return "error", RunPythonFileError( 47 | run_results.returncode, 48 | str(run_results.stdout, 'utf-8'), 49 | str(run_results.stderr, 'utf-8') 50 | ) 51 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/network_chaos/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos/job.j2: 
-------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: chaos-{{jobname}} 5 | spec: 6 | template: 7 | spec: 8 | nodeName: {{nodename}} 9 | hostNetwork: true 10 | containers: 11 | - name: networkchaos 12 | image: docker.io/fedora/tools 13 | command: ["/bin/sh", "-c", "{{cmd}}"] 14 | securityContext: 15 | privileged: true 16 | volumeMounts: 17 | - mountPath: /lib/modules 18 | name: lib-modules 19 | readOnly: true 20 | volumes: 21 | - name: lib-modules 22 | hostPath: 23 | path: /lib/modules 24 | restartPolicy: Never 25 | backoffLimit: 0 26 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos/pod.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: fedtools 5 | spec: 6 | hostNetwork: true 7 | nodeName: {{nodename}} 8 | containers: 9 | - name: fedtools 10 | image: docker.io/fedora/tools 11 | command: 12 | - /bin/sh 13 | - -c 14 | - | 15 | sleep infinity 16 | securityContext: 17 | privileged: true 18 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/network_chaos_ng/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | 4 | 5 | class NetworkChaosScenarioType(Enum): 6 | Node = 1 7 | Pod = 2 8 | 9 | @dataclass 10 | class BaseNetworkChaosConfig: 11 | supported_execution = ["serial", "parallel"] 12 | id: str 13 | wait_duration: int 14 | test_duration: int 15 | label_selector: str 16 | instance_count: int 17 | execution: str 18 | namespace: str 19 | 20 | def validate(self) -> list[str]: 21 | errors = [] 22 | if self.execution is None: 23 | errors.append(f"execution cannot be None, supported values are: {','.join(self.supported_execution)}") 24 | if self.execution not in self.supported_execution: 25 | errors.append(f"{self.execution} is not in supported execution mod: {','.join(self.supported_execution)}") 26 | if self.label_selector is None: 27 | errors.append("label_selector cannot be None") 28 | return errors 29 | 30 | @dataclass 31 | class NetworkFilterConfig(BaseNetworkChaosConfig): 32 | ingress: bool 33 | egress: bool 34 | interfaces: list[str] 35 | target: str 36 | ports: list[int] 37 | 38 | def validate(self) -> list[str]: 39 | errors = super().validate() 40 | # here further validations 41 | return errors 42 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/network_chaos_ng/modules/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/modules/abstract_network_chaos_module.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import 
logging 3 | import queue 4 | 5 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 6 | from krkn.scenario_plugins.network_chaos_ng.models import BaseNetworkChaosConfig, NetworkChaosScenarioType 7 | 8 | 9 | class AbstractNetworkChaosModule(abc.ABC): 10 | """ 11 | The abstract class that needs to be implemented by each Network Chaos Scenario 12 | """ 13 | @abc.abstractmethod 14 | def run(self, target: str, kubecli: KrknTelemetryOpenshift, error_queue: queue.Queue = None): 15 | """ 16 | the entrypoint method for the Network Chaos Scenario 17 | :param target: The resource name that will be targeted by the scenario (Node Name, Pod Name etc.) 18 | :param kubecli: The `KrknTelemetryOpenshift` needed by the scenario to access to the krkn-lib methods 19 | :param error_queue: A queue that will be used by the plugin to push the errors raised during the execution of parallel modules 20 | """ 21 | pass 22 | 23 | @abc.abstractmethod 24 | def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig): 25 | """ 26 | returns the common subset of settings shared by all the scenarios `BaseNetworkChaosConfig` and the type of Network 27 | Chaos Scenario that is running (Pod Scenario or Node Scenario) 28 | """ 29 | pass 30 | 31 | 32 | def log_info(self, message: str, parallel: bool = False, node_name: str = ""): 33 | """ 34 | log helper method for INFO severity to be used in the scenarios 35 | """ 36 | if parallel: 37 | logging.info(f"[{node_name}]: {message}") 38 | else: 39 | logging.info(message) 40 | 41 | def log_warning(self, message: str, parallel: bool = False, node_name: str = ""): 42 | """ 43 | log helper method for WARNING severity to be used in the scenarios 44 | """ 45 | if parallel: 46 | logging.warning(f"[{node_name}]: {message}") 47 | else: 48 | logging.warning(message) 49 | 50 | 51 | def log_error(self, message: str, parallel: bool = False, node_name: str = ""): 52 | """ 53 | log helper method for ERROR severity to be used in the scenarios 54 | """ 55 | if parallel: 56 | logging.error(f"[{node_name}]: {message}") 57 | else: 58 | logging.error(message) -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/modules/templates/network-chaos.j2: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: {{pod_name}} 5 | namespace: {{namespace}} 6 | spec: 7 | {% if host_network %} 8 | hostNetwork: true 9 | {%endif%} 10 | nodeSelector: 11 | kubernetes.io/hostname: {{target}} 12 | containers: 13 | - name: fedora 14 | imagePullPolicy: Always 15 | image: quay.io/krkn-chaos/krkn-network-chaos:latest 16 | securityContext: 17 | privileged: true 18 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/network_chaos_factory.py: -------------------------------------------------------------------------------- 1 | from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig 2 | from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import AbstractNetworkChaosModule 3 | from krkn.scenario_plugins.network_chaos_ng.modules.node_network_filter import NodeNetworkFilterModule 4 | 5 | 6 | supported_modules = ["node_network_filter"] 7 | 8 | class NetworkChaosFactory: 9 | 10 | @staticmethod 11 | def get_instance(config: dict[str, str]) -> AbstractNetworkChaosModule: 12 | if config["id"] is None: 13 | raise Exception("network chaos id cannot be None") 
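        # A minimal sketch of the scenario entry this factory expects (values are
        # illustrative; the keys mirror the BaseNetworkChaosConfig/NetworkFilterConfig
        # dataclasses, and the example shipped with the repo is scenarios/kube/network-filter.yml):
        #   - id: node_network_filter
        #     wait_duration: 30
        #     test_duration: 60
        #     label_selector: "node-role.kubernetes.io/worker"
        #     instance_count: 1
        #     execution: parallel
        #     namespace: "default"
        #     ingress: false
        #     egress: true
        #     interfaces: []
        #     target: ""
        #     ports: [80, 443]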
14 | if config["id"] not in supported_modules: 15 | raise Exception(f"{config['id']} is not a supported network chaos module") 16 | 17 | if config["id"] == "node_network_filter": 18 | config = NetworkFilterConfig(**config) 19 | errors = config.validate() 20 | if len(errors) > 0: 21 | raise Exception(f"config validation errors: [{';'.join(errors)}]") 22 | return NodeNetworkFilterModule(config) 23 | 24 | 25 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/network_chaos_ng/network_chaos_ng_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import queue 3 | import random 4 | import threading 5 | import time 6 | 7 | import yaml 8 | from krkn_lib.models.telemetry import ScenarioTelemetry 9 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 10 | 11 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 12 | from krkn.scenario_plugins.network_chaos_ng.models import ( 13 | NetworkChaosScenarioType, 14 | BaseNetworkChaosConfig, 15 | ) 16 | from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import ( 17 | AbstractNetworkChaosModule, 18 | ) 19 | from krkn.scenario_plugins.network_chaos_ng.network_chaos_factory import ( 20 | NetworkChaosFactory, 21 | ) 22 | 23 | 24 | class NetworkChaosNgScenarioPlugin(AbstractScenarioPlugin): 25 | def run( 26 | self, 27 | run_uuid: str, 28 | scenario: str, 29 | krkn_config: dict[str, any], 30 | lib_telemetry: KrknTelemetryOpenshift, 31 | scenario_telemetry: ScenarioTelemetry, 32 | ) -> int: 33 | try: 34 | with open(scenario, "r") as file: 35 | scenario_config = yaml.safe_load(file) 36 | if not isinstance(scenario_config, list): 37 | logging.error( 38 | "network chaos scenario config must be a list of objects" 39 | ) 40 | return 1 41 | for config in scenario_config: 42 | network_chaos = NetworkChaosFactory.get_instance(config) 43 | network_chaos_config = network_chaos.get_config() 44 | logging.info( 45 | f"running network_chaos scenario: {network_chaos_config[1].id}" 46 | ) 47 | if network_chaos_config[0] == NetworkChaosScenarioType.Node: 48 | targets = lib_telemetry.get_lib_kubernetes().list_nodes( 49 | network_chaos_config[1].label_selector 50 | ) 51 | else: 52 | targets = lib_telemetry.get_lib_kubernetes().list_pods( 53 | network_chaos_config[1].namespace, 54 | network_chaos_config[1].label_selector, 55 | ) 56 | if len(targets) == 0: 57 | logging.warning( 58 | f"no targets found for {network_chaos_config[1].id} " 59 | f"network chaos scenario with selector {network_chaos_config[1].label_selector} " 60 | f"with target type {network_chaos_config[0]}" 61 | ) 62 | 63 | if network_chaos_config[1].instance_count != 0 and network_chaos_config[1].instance_count > len(targets): 64 | targets = random.sample(targets, network_chaos_config[1].instance_count) 65 | 66 | if network_chaos_config[1].execution == "parallel": 67 | self.run_parallel(targets, network_chaos, lib_telemetry) 68 | else: 69 | self.run_serial(targets, network_chaos, lib_telemetry) 70 | if len(config) > 1: 71 | logging.info(f"waiting {network_chaos_config[1].wait_duration} seconds before running the next " 72 | f"Network Chaos NG Module") 73 | time.sleep(network_chaos_config[1].wait_duration) 74 | except Exception as e: 75 | logging.error(str(e)) 76 | return 1 77 | return 0 78 | 79 | def run_parallel( 80 | self, 81 | targets: list[str], 82 | module: AbstractNetworkChaosModule, 83 | lib_telemetry: 
KrknTelemetryOpenshift, 84 | ): 85 | error_queue = queue.Queue() 86 | threads = [] 87 | errors = [] 88 | for target in targets: 89 | thread = threading.Thread( 90 | target=module.run, args=[target, lib_telemetry, error_queue] 91 | ) 92 | thread.start() 93 | threads.append(thread) 94 | for thread in threads: 95 | thread.join() 96 | while True: 97 | try: 98 | errors.append(error_queue.get_nowait()) 99 | except queue.Empty: 100 | break 101 | if len(errors) > 0: 102 | raise Exception( 103 | f"module {module.get_config()[1].id} execution failed: [{';'.join(errors)}]" 104 | ) 105 | 106 | def run_serial( 107 | self, 108 | targets: list[str], 109 | module: AbstractNetworkChaosModule, 110 | lib_telemetry: KrknTelemetryOpenshift, 111 | ): 112 | for target in targets: 113 | module.run(target, lib_telemetry) 114 | 115 | def get_scenario_types(self) -> list[str]: 116 | return ["network_chaos_ng_scenarios"] 117 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/node_actions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/node_actions/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/node_actions/common_node_functions.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import random 4 | import logging 5 | import paramiko 6 | from krkn_lib.models.k8s import AffectedNode 7 | import krkn.invoke.command as runcommand 8 | from krkn_lib.k8s import KrknKubernetes 9 | from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus 10 | from krkn_lib.models.k8s import AffectedNode 11 | 12 | node_general = False 13 | 14 | 15 | def get_node_by_name(node_name_list, kubecli: KrknKubernetes): 16 | killable_nodes = kubecli.list_killable_nodes() 17 | for node_name in node_name_list: 18 | if node_name not in killable_nodes: 19 | logging.info( 20 | f"Node with provided ${node_name} does not exist or the node might " 21 | "be in NotReady state." 
22 | ) 23 | return 24 | return node_name_list 25 | 26 | 27 | # Pick a random node with specified label selector 28 | def get_node(label_selector, instance_kill_count, kubecli: KrknKubernetes): 29 | 30 | label_selector_list = label_selector.split(",") 31 | nodes = [] 32 | for label_selector in label_selector_list: 33 | nodes.extend(kubecli.list_killable_nodes(label_selector)) 34 | if not nodes: 35 | raise Exception("Ready nodes with the provided label selector do not exist") 36 | logging.info("Ready nodes with the label selector %s: %s" % (label_selector_list, nodes)) 37 | number_of_nodes = len(nodes) 38 | if instance_kill_count == number_of_nodes: 39 | return nodes 40 | nodes_to_return = [] 41 | for i in range(instance_kill_count): 42 | node_to_add = nodes[random.randint(0, len(nodes) - 1)] 43 | nodes_to_return.append(node_to_add) 44 | nodes.remove(node_to_add) 45 | return nodes_to_return 46 | 47 | # krkn_lib 48 | # Wait until the node status becomes Ready 49 | def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None): 50 | affected_node = kubecli.watch_node_status(node, "True", timeout, affected_node) 51 | return affected_node 52 | 53 | 54 | # krkn_lib 55 | # Wait until the node status becomes Not Ready 56 | def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None): 57 | affected_node = kubecli.watch_node_status(node, "False", timeout, affected_node) 58 | return affected_node 59 | 60 | 61 | # krkn_lib 62 | # Wait until the node status becomes Unknown 63 | def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None): 64 | affected_node = kubecli.watch_node_status(node, "Unknown", timeout, affected_node) 65 | return affected_node 66 | 67 | 68 | # Get the ip of the cluster node 69 | def get_node_ip(node): 70 | return runcommand.invoke( 71 | "kubectl get node %s -o " 72 | "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'" % (node) 73 | ) 74 | 75 | 76 | def check_service_status(node, service, ssh_private_key, timeout): 77 | ssh = paramiko.SSHClient() 78 | ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 79 | i = 0 80 | sleeper = 1 81 | while i <= timeout: 82 | try: 83 | time.sleep(sleeper) 84 | i += sleeper 85 | logging.info("Trying to ssh to instance: %s" % (node)) 86 | connection = ssh.connect( 87 | node, 88 | username="root", 89 | key_filename=ssh_private_key, 90 | timeout=800, 91 | banner_timeout=400, 92 | ) 93 | if connection is None: 94 | break 95 | except Exception as e: 96 | logging.error( 97 | "Failed to ssh to instance: %s within the timeout duration of %s: %s" 98 | % (node, timeout, e) 99 | ) 100 | 101 | for service_name in service: 102 | logging.info("Checking status of Service: %s" % (service_name)) 103 | stdin, stdout, stderr = ssh.exec_command( 104 | "systemctl status %s | grep '^ Active' " 105 | "| awk '{print $2}'" % (service_name) 106 | ) 107 | service_status = stdout.readlines()[0] 108 | logging.info( 109 | "Status of service %s is %s \n" % (service_name, service_status.strip()) 110 | ) 111 | if service_status.strip() != "active": 112 | logging.error( 113 | "Service %s is in %s state" % (service_name, service_status.strip()) 114 | ) 115 | ssh.close() 116 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from 
krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( 3 | abstract_node_scenarios, 4 | ) 5 | from krkn_lib.k8s import KrknKubernetes 6 | from krkn_lib.models.k8s import AffectedNodeStatus 7 | 8 | class GENERAL: 9 | def __init__(self): 10 | pass 11 | 12 | 13 | # krkn_lib 14 | class general_node_scenarios(abstract_node_scenarios): 15 | def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus): 16 | super().__init__(kubecli, affected_nodes_status) 17 | self.general = GENERAL() 18 | 19 | # Node scenario to start the node 20 | def node_start_scenario(self, instance_kill_count, node, timeout): 21 | logging.info( 22 | "Node start is not set up yet for this cloud type, " 23 | "no action is going to be taken" 24 | ) 25 | 26 | # Node scenario to stop the node 27 | def node_stop_scenario(self, instance_kill_count, node, timeout): 28 | logging.info( 29 | "Node stop is not set up yet for this cloud type," 30 | " no action is going to be taken" 31 | ) 32 | 33 | # Node scenario to terminate the node 34 | def node_termination_scenario(self, instance_kill_count, node, timeout): 35 | logging.info( 36 | "Node termination is not set up yet for this cloud type, " 37 | "no action is going to be taken" 38 | ) 39 | 40 | # Node scenario to reboot the node 41 | def node_reboot_scenario(self, instance_kill_count, node, timeout): 42 | logging.info( 43 | "Node reboot is not set up yet for this cloud type," 44 | " no action is going to be taken" 45 | ) 46 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/pvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/pvc/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/service_disruption/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/service_disruption/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/service_hijacking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/service_hijacking/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/service_hijacking/service_hijacking_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import yaml 5 | from krkn_lib.models.telemetry import ScenarioTelemetry 6 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 7 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 8 | 9 | 10 | class ServiceHijackingScenarioPlugin(AbstractScenarioPlugin): 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | with open(scenario) as stream: 20 | scenario_config = yaml.safe_load(stream) 21 | 22 | service_name = scenario_config["service_name"] 23 | service_namespace = scenario_config["service_namespace"] 24 | plan = 
scenario_config["plan"] 25 | image = scenario_config["image"] 26 | target_port = scenario_config["service_target_port"] 27 | chaos_duration = scenario_config["chaos_duration"] 28 | 29 | logging.info( 30 | f"checking service {service_name} in namespace: {service_namespace}" 31 | ) 32 | if not lib_telemetry.get_lib_kubernetes().service_exists( 33 | service_name, service_namespace 34 | ): 35 | logging.error( 36 | f"ServiceHijackingScenarioPlugin service: {service_name} not found in namespace: {service_namespace}, failed to run scenario." 37 | ) 38 | return 1 39 | try: 40 | logging.info( 41 | f"service: {service_name} found in namespace: {service_namespace}" 42 | ) 43 | logging.info(f"creating webservice and initializing test plan...") 44 | # both named ports and port numbers can be used 45 | if isinstance(target_port, int): 46 | logging.info(f"webservice will listen on port {target_port}") 47 | webservice = ( 48 | lib_telemetry.get_lib_kubernetes().deploy_service_hijacking( 49 | service_namespace, plan, image, port_number=target_port 50 | ) 51 | ) 52 | else: 53 | logging.info(f"traffic will be redirected to named port: {target_port}") 54 | webservice = ( 55 | lib_telemetry.get_lib_kubernetes().deploy_service_hijacking( 56 | service_namespace, plan, image, port_name=target_port 57 | ) 58 | ) 59 | logging.info( 60 | f"successfully deployed pod: {webservice.pod_name} " 61 | f"in namespace:{service_namespace} with selector {webservice.selector}!" 62 | ) 63 | logging.info( 64 | f"patching service: {service_name} to hijack traffic towards: {webservice.pod_name}" 65 | ) 66 | original_service = ( 67 | lib_telemetry.get_lib_kubernetes().replace_service_selector( 68 | [webservice.selector], service_name, service_namespace 69 | ) 70 | ) 71 | if original_service is None: 72 | logging.error( 73 | f"ServiceHijackingScenarioPlugin failed to patch service: {service_name}, namespace: {service_namespace} with selector {webservice.selector}" 74 | ) 75 | return 1 76 | 77 | logging.info(f"service: {service_name} successfully patched!") 78 | logging.info(f"original service manifest:\n\n{yaml.dump(original_service)}") 79 | logging.info(f"waiting {chaos_duration} before restoring the service") 80 | time.sleep(chaos_duration) 81 | selectors = [ 82 | "=".join([key, original_service["spec"]["selector"][key]]) 83 | for key in original_service["spec"]["selector"].keys() 84 | ] 85 | logging.info(f"restoring the service selectors {selectors}") 86 | original_service = ( 87 | lib_telemetry.get_lib_kubernetes().replace_service_selector( 88 | selectors, service_name, service_namespace 89 | ) 90 | ) 91 | if original_service is None: 92 | logging.error( 93 | f"ServiceHijackingScenarioPlugin failed to restore original " 94 | f"service: {service_name}, namespace: {service_namespace} with selectors: {selectors}" 95 | ) 96 | return 1 97 | logging.info("selectors successfully restored") 98 | logging.info("undeploying service-hijacking resources...") 99 | lib_telemetry.get_lib_kubernetes().undeploy_service_hijacking(webservice) 100 | return 0 101 | except Exception as e: 102 | logging.error( 103 | f"ServiceHijackingScenarioPlugin scenario {scenario} failed with exception: {e}" 104 | ) 105 | return 1 106 | 107 | def get_scenario_types(self) -> list[str]: 108 | return ["service_hijacking_scenarios"] 109 | -------------------------------------------------------------------------------- /krkn/scenario_plugins/shut_down/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/shut_down/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/syn_flood/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/syn_flood/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/time_actions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/time_actions/__init__.py -------------------------------------------------------------------------------- /krkn/scenario_plugins/zone_outage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/scenario_plugins/zone_outage/__init__.py -------------------------------------------------------------------------------- /krkn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/krkn/tests/__init__.py -------------------------------------------------------------------------------- /krkn/tests/test_classes/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class WrongModuleScenarioPlugin(AbstractScenarioPlugin): 10 | def run( 11 | self, 12 | run_uuid: str, 13 | scenario: str, 14 | krkn_config: dict[str, any], 15 | lib_telemetry: KrknTelemetryOpenshift, 16 | scenario_telemetry: ScenarioTelemetry, 17 | ) -> int: 18 | pass 19 | 20 | def get_scenario_types(self) -> list[str]: 21 | pass 22 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/correct_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class CorrectScenarioPlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | return ["correct_scenarios", "scenarios_correct"] 23 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/duplicated_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn_lib.models.telemetry import ScenarioTelemetry 2 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 3 | 4 | from krkn.scenario_plugins.abstract_scenario_plugin import 
AbstractScenarioPlugin 5 | 6 | 7 | class DuplicatedScenarioPlugin(AbstractScenarioPlugin): 8 | 9 | def run( 10 | self, 11 | run_uuid: str, 12 | scenario: str, 13 | krkn_config: dict[str, any], 14 | lib_telemetry: KrknTelemetryOpenshift, 15 | scenario_telemetry: ScenarioTelemetry, 16 | ) -> int: 17 | pass 18 | 19 | def get_scenario_types(self) -> list[str]: 20 | return ["another_irrelevant_scenario", "duplicated_scenario"] 21 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/duplicated_two_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn_lib.models.telemetry import ScenarioTelemetry 2 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 3 | 4 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 5 | 6 | 7 | class DuplicatedTwoScenarioPlugin(AbstractScenarioPlugin): 8 | 9 | def run( 10 | self, 11 | run_uuid: str, 12 | scenario: str, 13 | krkn_config: dict[str, any], 14 | lib_telemetry: KrknTelemetryOpenshift, 15 | scenario_telemetry: ScenarioTelemetry, 16 | ) -> int: 17 | pass 18 | 19 | def get_scenario_types(self) -> list[str]: 20 | return ["duplicated_scenario", "irellevant_scenario"] 21 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/example_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from krkn_lib.models.telemetry import ScenarioTelemetry 2 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 3 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 4 | 5 | 6 | # Each plugin must extend the AbstractScenarioPlugin abstract class 7 | # and implement its methods. Also the naming conventions must be respected 8 | # you can refer to the documentation for the details: 9 | # https://github.com/krkn-chaos/krkn/blob/main/docs/scenario_plugin_api.md 10 | class ExampleScenarioPlugin(AbstractScenarioPlugin): 11 | 12 | def run( 13 | self, 14 | run_uuid: str, 15 | scenario: str, 16 | krkn_config: dict[str, any], 17 | lib_telemetry: KrknTelemetryOpenshift, 18 | scenario_telemetry: ScenarioTelemetry, 19 | ) -> int: 20 | """ 21 | :param run_uuid: the uuid of the chaos run generated by krkn for every single run 22 | :param scenario: the config file of the scenario that is currently executed 23 | :param krkn_config: the full dictionary representation of the `config.yaml` 24 | :param lib_telemetry: it is a composite object of all the 25 | [krkn-lib](https://krkn-chaos.github.io/krkn-lib-docs/modules.html) 26 | objects and methods needed by a krkn plugin to run. 27 | :param scenario_telemetry: the `ScenarioTelemetry` object of the scenario that is currently executed 28 | """ 29 | 30 | pass 31 | 32 | try: 33 | # The scenario logic for each scenario must be placed 34 | # here. A try-except it is needed to catch exceptions 35 | # that may occur in this section and they shouldn't 36 | # be propagated outside (only int return value is admitted). 
37 | 38 | # krkn-lib KrknKubernetes object containing all the kubernetes primitives 39 | # can be retrieved by the KrknTelemetryOpenshift object 40 | krkn_kubernetes = lib_telemetry.get_lib_kubernetes() 41 | 42 | # krkn-lib KrknOpenshift object containing all the OCP primitives 43 | # can be retrieved by the KrknTelemetryOpenshift object 44 | krkn_openshift = lib_telemetry.get_lib_ocp() 45 | 46 | # if the scenario succeeds the telemetry exit status is 0 47 | return 0 48 | except Exception as e: 49 | # if the scenario fails the telemetry exit status is 1 50 | return 1 51 | 52 | # Reflects the scenario type defined in the config.yaml 53 | # in the chaos_scenarios section and to which each class 54 | # responds. 55 | def get_scenario_types(self) -> list[str]: 56 | return ["example_scenarios"] 57 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/snake_case_mismatch_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class SnakeMismatchScenarioPlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | pass 23 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/wrong_classname_scenario_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class WrongClassNamePlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | pass 23 | -------------------------------------------------------------------------------- /krkn/tests/test_classes/wrong_module.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | 6 | from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin 7 | 8 | 9 | class WrongModuleScenarioPlugin(AbstractScenarioPlugin): 10 | 11 | def run( 12 | self, 13 | run_uuid: str, 14 | scenario: str, 15 | krkn_config: dict[str, any], 16 | lib_telemetry: KrknTelemetryOpenshift, 17 | scenario_telemetry: ScenarioTelemetry, 18 | ) -> int: 19 | pass 20 | 21 | def get_scenario_types(self) -> list[str]: 22 | pass 23 | -------------------------------------------------------------------------------- /krkn/tests/test_plugin_factory.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from krkn.scenario_plugins.abstract_scenario_plugin 
import AbstractScenarioPlugin 4 | from krkn.scenario_plugins.scenario_plugin_factory import ScenarioPluginFactory 5 | from krkn.tests.test_classes.correct_scenario_plugin import ( 6 | CorrectScenarioPlugin, 7 | ) 8 | 9 | 10 | class TestPluginFactory(unittest.TestCase): 11 | 12 | def test_plugin_factory(self): 13 | factory = ScenarioPluginFactory("krkn.tests.test_classes") 14 | self.assertEqual(len(factory.loaded_plugins), 5) 15 | self.assertEqual(len(factory.failed_plugins), 4) 16 | self.assertIs( 17 | factory.loaded_plugins["correct_scenarios"].__base__, 18 | AbstractScenarioPlugin, 19 | ) 20 | self.assertTrue( 21 | isinstance( 22 | factory.loaded_plugins["correct_scenarios"](), CorrectScenarioPlugin 23 | ) 24 | ) 25 | # soLid 26 | self.assertTrue( 27 | isinstance( 28 | factory.loaded_plugins["correct_scenarios"](), AbstractScenarioPlugin 29 | ) 30 | ) 31 | 32 | self.assertTrue( 33 | "krkn.tests.test_classes.snake_case_mismatch_scenario_plugin" 34 | in [p[0] for p in factory.failed_plugins] 35 | ) 36 | self.assertTrue( 37 | "krkn.tests.test_classes.wrong_classname_scenario_plugin" 38 | in [p[0] for p in factory.failed_plugins] 39 | ) 40 | self.assertTrue( 41 | "krkn.tests.test_classes.wrong_module" 42 | in [p[0] for p in factory.failed_plugins] 43 | ) 44 | 45 | def test_plugin_factory_naming_convention(self): 46 | factory = ScenarioPluginFactory() 47 | correct_module_name = "krkn.scenario_plugins.example.correct_scenario_plugin" 48 | correct_class_name = "CorrectScenarioPlugin" 49 | correct_class_name_no_match = "NoMatchScenarioPlugin" 50 | wrong_module_name = "krkn.scenario_plugins.example.correct_plugin" 51 | wrong_class_name = "WrongScenario" 52 | wrong_folder_name_plugin = ( 53 | "krkn.scenario_plugins.example_plugin.example_plugin_scenario_plugin" 54 | ) 55 | wrong_folder_name_plugin_class_name = "ExamplePluginScenarioPlugin" 56 | wrong_folder_name_scenario = ( 57 | "krkn.scenario_plugins.example_scenario.example_scenario_scenario_plugin" 58 | ) 59 | wrong_folder_name_scenario_class_name = "ExampleScenarioScenarioPlugin" 60 | 61 | result, message = factory.is_naming_convention_correct( 62 | correct_module_name, correct_class_name 63 | ) 64 | self.assertTrue(result) 65 | self.assertIsNone(message) 66 | 67 | result, message = factory.is_naming_convention_correct( 68 | wrong_module_name, correct_class_name 69 | ) 70 | self.assertFalse(result) 71 | self.assertEqual( 72 | message, 73 | "scenario plugin module file names must end with `_scenario_plugin` suffix", 74 | ) 75 | 76 | result, message = factory.is_naming_convention_correct( 77 | correct_module_name, wrong_class_name 78 | ) 79 | self.assertFalse(result) 80 | self.assertEqual( 81 | message, 82 | "scenario plugin class name must start with a capital letter, " 83 | "end with `ScenarioPlugin`, and cannot be just `ScenarioPlugin`.", 84 | ) 85 | 86 | result, message = factory.is_naming_convention_correct( 87 | correct_module_name, correct_class_name_no_match 88 | ) 89 | self.assertFalse(result) 90 | self.assertEqual( 91 | message, 92 | "module file name must in snake case must match class name in capital camel case " 93 | "e.g. 
`example_scenario_plugin` -> `ExampleScenarioPlugin`", 94 | ) 95 | 96 | result, message = factory.is_naming_convention_correct( 97 | wrong_folder_name_plugin, wrong_folder_name_plugin_class_name 98 | ) 99 | self.assertFalse(result) 100 | self.assertEqual( 101 | message, "scenario plugin folder cannot contain `scenario` or `plugin` word" 102 | ) 103 | 104 | result, message = factory.is_naming_convention_correct( 105 | wrong_folder_name_scenario, wrong_folder_name_scenario_class_name 106 | ) 107 | self.assertFalse(result) 108 | self.assertEqual( 109 | message, "scenario plugin folder cannot contain `scenario` or `plugin` word" 110 | ) 111 | -------------------------------------------------------------------------------- /krkn/utils/HealthChecker.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import logging 4 | import queue 5 | from datetime import datetime 6 | from krkn_lib.models.telemetry.models import HealthCheck 7 | 8 | class HealthChecker: 9 | current_iterations: int = 0 10 | ret_value = 0 11 | def __init__(self, iterations): 12 | self.iterations = iterations 13 | 14 | def make_request(self, url, auth=None, headers=None, verify=True): 15 | response_data = {} 16 | response = requests.get(url, auth=auth, headers=headers, verify=verify) 17 | response_data["url"] = url 18 | response_data["status"] = response.status_code == 200 19 | response_data["status_code"] = response.status_code 20 | return response_data 21 | 22 | 23 | def run_health_check(self, health_check_config, health_check_telemetry_queue: queue.Queue): 24 | if health_check_config and health_check_config["config"] and any(config.get("url") for config in health_check_config["config"]): 25 | health_check_start_time_stamp = datetime.now() 26 | health_check_telemetry = [] 27 | health_check_tracker = {} 28 | interval = health_check_config["interval"] if health_check_config["interval"] else 2 29 | 30 | response_tracker = {config["url"]:True for config in health_check_config["config"]} 31 | while self.current_iterations < self.iterations: 32 | for config in health_check_config.get("config"): 33 | auth, headers = None, None 34 | verify_url = config["verify_url"] if "verify_url" in config else True 35 | if config["url"]: url = config["url"] 36 | 37 | if config["bearer_token"]: 38 | bearer_token = "Bearer " + config["bearer_token"] 39 | headers = {"Authorization": bearer_token} 40 | 41 | if config["auth"]: auth = tuple(config["auth"].split(',')) 42 | response = self.make_request(url, auth, headers, verify_url) 43 | 44 | if response["status_code"] != 200: 45 | if config["url"] not in health_check_tracker: 46 | start_timestamp = datetime.now() 47 | health_check_tracker[config["url"]] = { 48 | "status_code": response["status_code"], 49 | "start_timestamp": start_timestamp 50 | } 51 | if response_tracker[config["url"]] != False: response_tracker[config["url"]] = False 52 | if config["exit_on_failure"] and config["exit_on_failure"] == True and self.ret_value==0: self.ret_value = 2 53 | else: 54 | if config["url"] in health_check_tracker: 55 | end_timestamp = datetime.now() 56 | start_timestamp = health_check_tracker[config["url"]]["start_timestamp"] 57 | previous_status_code = str(health_check_tracker[config["url"]]["status_code"]) 58 | duration = (end_timestamp - start_timestamp).total_seconds() 59 | downtime_record = { 60 | "url": config["url"], 61 | "status": False, 62 | "status_code": previous_status_code, 63 | "start_timestamp": start_timestamp.isoformat(), 64 | 
"end_timestamp": end_timestamp.isoformat(), 65 | "duration": duration 66 | } 67 | health_check_telemetry.append(HealthCheck(downtime_record)) 68 | del health_check_tracker[config["url"]] 69 | time.sleep(interval) 70 | health_check_end_time_stamp = datetime.now() 71 | for url, status in response_tracker.items(): 72 | if status == True: 73 | duration = (health_check_end_time_stamp - health_check_start_time_stamp).total_seconds() 74 | success_response = { 75 | "url": url, 76 | "status": True, 77 | "status_code": 200, 78 | "start_timestamp": health_check_start_time_stamp.isoformat(), 79 | "end_timestamp": health_check_end_time_stamp.isoformat(), 80 | "duration": duration 81 | } 82 | health_check_telemetry.append(HealthCheck(success_response)) 83 | health_check_telemetry_queue.put(health_check_telemetry) 84 | else: 85 | logging.info("health checks config is not defined, skipping them") -------------------------------------------------------------------------------- /krkn/utils/TeeLogHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | class TeeLogHandler(logging.Handler): 3 | logs: list[str] = [] 4 | name = "TeeLogHandler" 5 | 6 | def get_output(self) -> str: 7 | return "\n".join(self.logs) 8 | 9 | def emit(self, record): 10 | self.logs.append(self.formatter.format(record)) 11 | def __del__(self): 12 | pass -------------------------------------------------------------------------------- /krkn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .TeeLogHandler import TeeLogHandler 2 | from .functions import * 3 | -------------------------------------------------------------------------------- /krkn/utils/functions.py: -------------------------------------------------------------------------------- 1 | import krkn_lib.utils 2 | from krkn_lib.k8s import KrknKubernetes 3 | from krkn_lib.models.telemetry import ScenarioTelemetry 4 | from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift 5 | from tzlocal.unix import get_localzone 6 | import logging 7 | 8 | def populate_cluster_events( 9 | krkn_config: dict, 10 | scenario_config: dict, 11 | kubecli: KrknKubernetes, 12 | start_timestamp: int, 13 | end_timestamp: int, 14 | ): 15 | events = [] 16 | namespaces = __retrieve_namespaces(scenario_config, kubecli) 17 | 18 | if len(namespaces) == 0: 19 | events.extend( 20 | kubecli.collect_and_parse_cluster_events( 21 | start_timestamp, end_timestamp, str(get_localzone()) 22 | ) 23 | ) 24 | else: 25 | for namespace in namespaces: 26 | events.extend( 27 | kubecli.collect_and_parse_cluster_events( 28 | start_timestamp, 29 | end_timestamp, 30 | str(get_localzone()), 31 | namespace=namespace, 32 | ) 33 | ) 34 | archive_path = krkn_config["telemetry"]["archive_path"] 35 | file_path = archive_path + "/events.json" 36 | with open(file_path, "w+") as f: 37 | f.write("\n".join(str(item) for item in events)) 38 | logging.info(f'Find cluster events in file {file_path}' ) 39 | 40 | 41 | 42 | def collect_and_put_ocp_logs( 43 | telemetry_ocp: KrknTelemetryOpenshift, 44 | scenario_config: dict, 45 | request_id: str, 46 | start_timestamp: int, 47 | end_timestamp: int, 48 | ): 49 | if ( 50 | telemetry_ocp.get_telemetry_config() 51 | and telemetry_ocp.get_telemetry_config()["enabled"] 52 | and telemetry_ocp.get_telemetry_config()["logs_backup"] 53 | and not telemetry_ocp.get_lib_kubernetes().is_kubernetes() 54 | ): 55 | namespaces = __retrieve_namespaces( 56 | scenario_config, 
telemetry_ocp.get_lib_kubernetes() 57 | ) 58 | if len(namespaces) > 0: 59 | for namespace in namespaces: 60 | telemetry_ocp.put_ocp_logs( 61 | request_id, 62 | telemetry_ocp.get_telemetry_config(), 63 | start_timestamp, 64 | end_timestamp, 65 | namespace, 66 | ) 67 | else: 68 | telemetry_ocp.put_ocp_logs( 69 | request_id, 70 | telemetry_ocp.get_telemetry_config(), 71 | start_timestamp, 72 | end_timestamp, 73 | ) 74 | 75 | 76 | def __retrieve_namespaces(scenario_config: dict, kubecli: KrknKubernetes) -> set[str]: 77 | namespaces = list() 78 | namespaces.extend(krkn_lib.utils.deep_get_attribute("namespace", scenario_config)) 79 | namespace_patterns = krkn_lib.utils.deep_get_attribute( 80 | "namespace_pattern", scenario_config 81 | ) 82 | for pattern in namespace_patterns: 83 | namespaces.extend(kubecli.list_namespaces_by_regex(pattern)) 84 | return set(namespaces) 85 | -------------------------------------------------------------------------------- /media/KrakenStarting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/media/KrakenStarting.png -------------------------------------------------------------------------------- /media/kraken-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/media/kraken-workflow.png -------------------------------------------------------------------------------- /media/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krkn-chaos/krkn/5bdbf622c32282e1978cc17036afd4096546354a/media/logo.png -------------------------------------------------------------------------------- /rbac/non-privileged-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: krkn-non-privileged-role 5 | namespace: target-namespace 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods", "services"] 9 | verbs: ["get", "list", "watch", "create", "delete"] 10 | - apiGroups: ["apps"] 11 | resources: ["deployments", "statefulsets"] 12 | verbs: ["get", "list", "watch", "create", "delete"] 13 | - apiGroups: ["batch"] 14 | resources: ["jobs"] 15 | verbs: ["get", "list", "watch", "create", "delete"] 16 | -------------------------------------------------------------------------------- /rbac/non-privileged-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: krkn-non-privileged-rolebinding 5 | namespace: target-namespace 6 | subjects: 7 | - kind: ServiceAccount 8 | name: krkn-sa 9 | namespace: target-namespace 10 | roleRef: 11 | kind: Role 12 | name: krkn-non-privileged-role 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /rbac/privileged-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: krkn-privileged-clusterrole 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["nodes"] 8 | verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 9 | - apiGroups: [""] 10 | resources: ["pods", "services"] 11 
| verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 12 | - apiGroups: ["apps"] 13 | resources: ["deployments", "statefulsets"] 14 | verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 15 | - apiGroups: ["batch"] 16 | resources: ["jobs"] 17 | verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] 18 | -------------------------------------------------------------------------------- /rbac/privileged-clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: krkn-privileged-clusterrolebinding 5 | subjects: 6 | - kind: ServiceAccount 7 | name: krkn-sa 8 | namespace: krkn-namespace 9 | roleRef: 10 | kind: ClusterRole 11 | name: krkn-privileged-clusterrole 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aliyun-python-sdk-core==2.13.36 2 | aliyun-python-sdk-ecs==4.24.25 3 | arcaflow-plugin-sdk==0.14.0 4 | boto3==1.28.61 5 | azure-identity==1.16.1 6 | azure-keyvault==4.2.0 7 | azure-mgmt-compute==30.5.0 8 | azure-mgmt-network==27.0.0 9 | itsdangerous==2.0.1 10 | coverage==7.6.12 11 | datetime==5.4 12 | docker==7.0.0 13 | gitpython==3.1.41 14 | google-auth==2.37.0 15 | google-cloud-compute==1.22.0 16 | ibm_cloud_sdk_core==3.18.0 17 | ibm_vpc==0.20.0 18 | jinja2==3.1.6 19 | krkn-lib==5.0.1 20 | lxml==5.1.0 21 | kubernetes==28.1.0 22 | numpy==1.26.4 23 | pandas==2.2.0 24 | openshift-client==1.0.21 25 | paramiko==3.4.0 26 | pyVmomi==8.0.2.0.1 27 | pyfiglet==1.0.2 28 | pytest==8.0.0 29 | python-ipmi==0.5.4 30 | python-openstackclient==6.5.0 31 | requests==2.32.2 32 | service_identity==24.1.0 33 | PyYAML==6.0.1 34 | setuptools==78.1.1 35 | werkzeug==3.0.6 36 | wheel==0.42.0 37 | zope.interface==5.4.0 38 | 39 | 40 | git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git@v0.1.0 41 | git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0 42 | cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability 43 | -------------------------------------------------------------------------------- /scenarios/kind/node_scenarios_example.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # node chaos scenarios to be injected 3 | - node_stop_start_scenario 4 | node_name: kind-worker # node on which scenario has to be injected; can set multiple names separated by comma 5 | # label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection 6 | instance_count: 1 # Number of nodes to perform action/select that match the label selector 7 | runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) 8 | timeout: 120 # duration to wait for completion of node scenario injection 9 | cloud_type: docker # cloud type on which Kubernetes/OpenShift runs 10 | - actions: 11 | - node_reboot_scenario 12 | node_name: kind-worker 13 | # label_selector: node-role.kubernetes.io/infra 14 | instance_count: 1 15 | timeout: 120 16 | cloud_type: docker 17 | -------------------------------------------------------------------------------- /scenarios/kind/scheduler.yml: 
-------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^kube-system$ 5 | label_selector: component=kube-scheduler 6 | krkn_pod_recovery_time: 120 7 | -------------------------------------------------------------------------------- /scenarios/kube/container_dns.yml: -------------------------------------------------------------------------------- 1 | scenarios: 2 | - name: "kill dns container" 3 | namespace: "kube-system" 4 | label_selector: "k8s-app=kube-dns" 5 | container_name: "" 6 | action: 1 7 | count: 1 8 | retry_wait: 60 9 | -------------------------------------------------------------------------------- /scenarios/kube/cpu-hog.yml: -------------------------------------------------------------------------------- 1 | duration: 60 2 | workers: '' # leave it empty '' node cpu auto-detection 3 | hog-type: cpu 4 | image: quay.io/krkn-chaos/krkn-hog 5 | namespace: default 6 | cpu-load-percentage: 90 7 | cpu-method: all 8 | node-selector: "node-role.kubernetes.io/worker=" 9 | number-of-nodes: 2 10 | taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"] 11 | -------------------------------------------------------------------------------- /scenarios/kube/io-hog.yml: -------------------------------------------------------------------------------- 1 | duration: 30 2 | workers: '' # leave it empty '' node cpu auto-detection 3 | hog-type: io 4 | image: quay.io/krkn-chaos/krkn-hog 5 | namespace: default 6 | io-block-size: 1m 7 | io-write-bytes: 1g 8 | io-target-pod-folder: /hog-data 9 | io-target-pod-volume: 10 | name: node-volume 11 | hostPath: 12 | path: /root # a path writable by kubelet in the root filesystem of the node 13 | node-selector: "node-role.kubernetes.io/worker=" 14 | number-of-nodes: '' 15 | taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"] -------------------------------------------------------------------------------- /scenarios/kube/managedcluster_scenarios_example.yml: -------------------------------------------------------------------------------- 1 | managedcluster_scenarios: 2 | - actions: # ManagedCluster chaos scenarios to be injected 3 | - managedcluster_stop_start_scenario 4 | managedcluster_name: cluster1 # ManagedCluster on which scenario has to be injected; can set multiple names separated by comma 5 | # label_selector: # When managedcluster_name is not specified, a ManagedCluster with matching label_selector is selected for ManagedCluster chaos scenario injection 6 | instance_count: 1 # Number of managedcluster to perform action/select that match the label selector 7 | runs: 1 # Number of times to inject each scenario under actions (will perform on same ManagedCluster each time) 8 | timeout: 420 # Duration to wait for completion of ManagedCluster scenario injection 9 | # For OCM to detect a ManagedCluster as unavailable, have to wait 5*leaseDurationSeconds 10 | # (default leaseDurationSeconds = 60 sec) 11 | - actions: 12 | - stop_start_klusterlet_scenario 13 | managedcluster_name: cluster1 14 | # label_selector: 15 | instance_count: 1 16 | runs: 1 17 | timeout: 60 -------------------------------------------------------------------------------- /scenarios/kube/memory-hog.yml: -------------------------------------------------------------------------------- 1 | duration: 60 2 | workers: '' # leave it empty '' node cpu auto-detection 3 | hog-type: memory 4 | image: quay.io/krkn-chaos/krkn-hog 5 | namespace: 
default 6 | memory-vm-bytes: 90% 7 | node-selector: "node-role.kubernetes.io/worker=" 8 | number-of-nodes: '' 9 | taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"] 10 | -------------------------------------------------------------------------------- /scenarios/kube/network-filter.yml: -------------------------------------------------------------------------------- 1 | - id: node_network_filter 2 | wait_duration: 300 3 | test_duration: 100 4 | label_selector: "kubernetes.io/hostname=ip-10-0-39-182.us-east-2.compute.internal" 5 | namespace: 'default' 6 | instance_count: 1 7 | execution: parallel 8 | ingress: false 9 | egress: true 10 | target: node 11 | interfaces: [] 12 | ports: 13 | - 2049 -------------------------------------------------------------------------------- /scenarios/kube/pod.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | name_pattern: ^nginx-.*$ 5 | namespace_pattern: ^default$ 6 | kill: 1 7 | krkn_pod_recovery_time: 120 8 | -------------------------------------------------------------------------------- /scenarios/kube/scheduler.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^kube-system$ 5 | label_selector: k8s-app=kube-scheduler 6 | krkn_pod_recovery_time: 120 7 | -------------------------------------------------------------------------------- /scenarios/kube/service_hijacking.yaml: -------------------------------------------------------------------------------- 1 | # refer to the documentation for further infos https://github.com/krkn-chaos/krkn/blob/main/docs/service_hijacking.md 2 | 3 | service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration). 4 | service_name: nginx-service # name of the service to be hijacked 5 | service_namespace: default # The namespace where the target service is located 6 | image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service to be deployed to receive traffic. 7 | chaos_duration: 30 # Total duration of the chaos scenario in seconds. 8 | plan: 9 | - resource: "/list/index.php" # Specifies the resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored. 10 | # For resources, only query parameters are captured. 11 | 12 | steps: # A time-based plan consisting of steps can be defined for each resource. 13 | GET: # One or more HTTP methods can be specified for each step. 14 | # Note: Non-standard methods are supported 15 | # for fully custom web services (e.g., using NONEXISTENT instead of POST). 16 | 17 | - duration: 15 # Duration in seconds for this step before moving to the next one, if defined. Otherwise, 18 | # this step will continue until the chaos scenario ends. 19 | 20 | status: 500 # HTTP status code to be returned in this step. 21 | mime_type: "application/json" # MIME type of the response for this step. 22 | payload: | # The response payload for this step. 
23 | { 24 | "status":"internal server error" 25 | } 26 | - duration: 15 27 | status: 201 28 | mime_type: "application/json" 29 | payload: | 30 | { 31 | "status":"resource created" 32 | } 33 | POST: 34 | - duration: 15 35 | status: 401 36 | mime_type: "application/json" 37 | payload: | 38 | { 39 | "status": "unauthorized" 40 | } 41 | - duration: 15 42 | status: 404 43 | mime_type: "text/plain" 44 | payload: "not found" 45 | 46 | - resource: "/patch" 47 | steps: 48 | PATCH: 49 | - duration: 15 50 | status: 201 51 | mime_type: "text/plain" 52 | payload: "resource patched" 53 | - duration: 15 54 | status: 400 55 | mime_type: "text/plain" 56 | payload: "bad request" -------------------------------------------------------------------------------- /scenarios/kube/syn_flood.yaml: -------------------------------------------------------------------------------- 1 | packet-size: 120 # hping3 packet size 2 | window-size: 64 # hping 3 TCP window size 3 | duration: 10 # chaos scenario duration 4 | namespace: default # namespace where the target service(s) are deployed 5 | target-service: elasticsearch # target service name (if set target-service-label must be empty) 6 | target-port: 9200 # target service TCP port 7 | target-service-label : "" # target service label, can be used to target multiple target at the same time 8 | # if they have the same label set (if set target-service must be empty) 9 | number-of-pods: 2 # number of attacker pod instantiated per each target 10 | image: quay.io/krkn-chaos/krkn-syn-flood:v1.0.0 # syn flood attacker container image 11 | attacker-nodes: # this will set the node affinity to schedule the attacker node. Per each node label selector 12 | node-role.kubernetes.io/worker: # can be specified multiple values in this way the kube scheduler will schedule the attacker pods 13 | - "" # in the best way possible based on the provided labels. 
Multiple labels can be specified 14 | # set empty value `attacker-nodes: {}` to let kubernetes schedule the pods 15 | 16 | 17 | -------------------------------------------------------------------------------- /scenarios/openshift/app_outage.yaml: -------------------------------------------------------------------------------- 1 | application_outage: # Scenario to create an outage of an application by blocking traffic 2 | duration: 600 # Duration in seconds after which the routes will be accessible 3 | namespace: # Namespace to target - all application routes will go inaccessible if pod selector is empty 4 | pod_selector: {app: foo} # Pods to target 5 | block: [Ingress, Egress] # It can be Ingress or Egress or Ingress, Egress 6 | -------------------------------------------------------------------------------- /scenarios/openshift/aws_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # node chaos scenarios to be injected 3 | - node_stop_start_scenario 4 | node_name: # node on which scenario has to be injected; can set multiple names separated by comma 5 | label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection; can specify multiple by a comma separated list 6 | instance_count: 2 # Number of nodes to perform action/select that match the label selector 7 | runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) 8 | timeout: 360 # duration to wait for completion of node scenario injection 9 | duration: 20 # duration to stop the node before running the start action 10 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 11 | parallel: true # Run action on label or node name in parallel or sequential, defaults to sequential 12 | - actions: 13 | - node_reboot_scenario 14 | node_name: 15 | label_selector: node-role.kubernetes.io/infra 16 | instance_count: 1 17 | timeout: 120 18 | cloud_type: aws 19 | - actions: 20 | - node_disk_detach_attach_scenario 21 | node_name: 22 | label_selector: 23 | instance_count: 1 24 | timeout: 120 25 | cloud_type: aws -------------------------------------------------------------------------------- /scenarios/openshift/azure_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: 3 | - node_reboot_scenario 4 | node_name: 5 | label_selector: node-role.kubernetes.io/infra 6 | instance_count: 1 7 | timeout: 120 8 | cloud_type: azure 9 | - actions: 10 | - node_stop_start_scenario 11 | node_name: 12 | label_selector: node-role.kubernetes.io/infra 13 | instance_count: 1 14 | timeout: 360 15 | duration: 120 16 | cloud_type: azure 17 | -------------------------------------------------------------------------------- /scenarios/openshift/baremetal_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: # Node chaos scenarios to be injected. 3 | - node_stop_start_scenario 4 | node_name: # Node on which scenario has to be injected. 5 | label_selector: node-role.kubernetes.io/worker # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection. 6 | instance_count: 1 # Number of nodes to perform action/select that match the label selector. 
7 | runs: 1 # Number of times to inject each scenario under actions (will perform on same node each time). 8 | timeout: 360 # Duration to wait for completion of node scenario injection. 9 | duration: 120 # Duration to stop the node before running the start action 10 | cloud_type: bm # Cloud type on which Kubernetes/OpenShift runs. 11 | bmc_user: defaultuser # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines. 12 | bmc_password: defaultpass # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines. 13 | bmc_info: # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info. 14 | node-1: # The node name for the baremetal machine 15 | bmc_addr: mgmt-machine1.example.com # Optional. For baremetal nodes with the IPMI BMC address missing from 'oc get bmh'. 16 | node-2: 17 | bmc_addr: mgmt-machine2.example.com 18 | bmc_user: user # The baremetal IPMI user. Overrides the default IPMI user specified above. Optional if the default is set. 19 | bmc_password: pass # The baremetal IPMI password. Overrides the default IPMI user specified above. Optional if the default is set 20 | -------------------------------------------------------------------------------- /scenarios/openshift/cluster_shut_down_scenario.yml: -------------------------------------------------------------------------------- 1 | cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes 2 | runs: 1 # Number of times to execute the cluster_shut_down scenario 3 | shut_down_duration: 150 # duration in seconds to shut down the cluster 4 | cloud_type: aws # cloud type on which Kubernetes/OpenShift runs 5 | timeout: 60 # Number of seconds to wait for each node to be stopped or running 6 | -------------------------------------------------------------------------------- /scenarios/openshift/container_etcd.yml: -------------------------------------------------------------------------------- 1 | scenarios: 2 | - name: "kill etcd container" 3 | namespace: "openshift-etcd" 4 | label_selector: "k8s-app=etcd" 5 | container_name: "etcd" 6 | action: 1 7 | count: 1 8 | expected_recovery_time: 120 9 | -------------------------------------------------------------------------------- /scenarios/openshift/customapp_pod.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^acme-air$ 5 | name_pattern: .* 6 | krkn_pod_recovery_time: 120 -------------------------------------------------------------------------------- /scenarios/openshift/etcd.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: kill-pods 3 | config: 4 | namespace_pattern: ^openshift-etcd$ 5 | label_selector: k8s-app=etcd 6 | krkn_pod_recovery_time: 120 7 | -------------------------------------------------------------------------------- /scenarios/openshift/gcp_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: 3 | - node_reboot_scenario 4 | node_name: 5 | label_selector: node-role.kubernetes.io/worker 6 | instance_count: 1 7 | timeout: 120 8 | cloud_type: gcp 9 | - actions: 10 | - node_stop_start_scenario 11 | node_name: 12 | label_selector: node-role.kubernetes.io/worker 13 | 
instance_count: 1 14 | timeout: 360 15 | duration: 120 16 | cloud_type: gcp 17 | -------------------------------------------------------------------------------- /scenarios/openshift/ibmcloud_node_scenarios.yml: -------------------------------------------------------------------------------- 1 | node_scenarios: 2 | - actions: 3 | - node_stop_start_scenario 4 | node_name: 5 | label_selector: node-role.kubernetes.io/worker 6 | instance_count: 1 7 | timeout: 360 8 | duration: 120 9 | cloud_type: ibm 10 | - actions: 11 | - node_reboot_scenario 12 | node_name: 13 | label_selector: node-role.kubernetes.io/worker 14 | instance_count: 1 15 | timeout: 120 16 | cloud_type: ibm -------------------------------------------------------------------------------- /scenarios/openshift/ingress_namespace.yaml: -------------------------------------------------------------------------------- 1 | scenarios: 2 | - namespace: "^.*ingress.*$" 3 | runs: 1 4 | sleep: 15 5 | wait_time: 300 6 | -------------------------------------------------------------------------------- /scenarios/openshift/network_chaos.yaml: -------------------------------------------------------------------------------- 1 | network_chaos: # Scenario to create an outage by simulating random variations in the network. 2 | duration: 300 # seconds 3 | node_name: # node on which scenario has to be injected; 4 | label_selector: # when node_name is not specified, a node with matching label_selector is selected for running the scenario. 5 | instance_count: 1 6 | interfaces: # Interface name would be the Kernel host network interface name. 7 | - "" 8 | execution: serial 9 | egress: 10 | latency: 50ms # 50ms 11 | loss: 0.02 # percentage 12 | -------------------------------------------------------------------------------- /scenarios/openshift/network_chaos_ingress.yml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=../plugin.schema.json 2 | - id: network_chaos 3 | config: 4 | node_interface_name: # Dictionary with key as node name(s) and value as a list of its interfaces to test 5 | : 6 | - 7 | label_selector: # When node_interface_name is not specified, nodes with matching label_selector is selected for node chaos scenario injection 8 | instance_count: # Number of nodes to perform action/select that match the label selector 9 | kubeconfig_path: # Path to kubernetes config file. If not specified, it defaults to ~/.kube/config 10 | execution_type: # Used to specify whether you want to apply filters on interfaces one at a time or all at once. Default is 'parallel' 11 | network_params: # latency, loss and bandwidth are the three supported network parameters to alter for the chaos test 12 | latency: