--------------------------------------------------------------------------------
/containers/Dockerfile:
--------------------------------------------------------------------------------
1 | # Dockerfile for cerberus
2 |
3 | FROM quay.io/openshift/origin-tests:latest AS origintests
4 |
5 | FROM quay.io/centos/centos:stream9
6 |
7 | LABEL maintainer="Red Hat Chaos Engineering Team"
8 |
9 | ENV KUBECONFIG /root/.kube/config
10 |
11 | # Copy OpenShift CLI, Kubernetes CLI from origin-tests image
12 | COPY --from=origintests /usr/bin/oc /usr/bin/oc
13 | COPY --from=origintests /usr/bin/kubectl /usr/bin/kubectl
14 |
15 | # Install dependencies
16 | RUN yum install -y git python39 python3-pip gcc python3-devel zlib-devel libjpeg-devel
17 |
18 | RUN git clone https://github.com/krkn-chaos/cerberus.git --branch v1.0.4 /root/cerberus && \
19 | mkdir -p /root/.kube && cd /root/cerberus && pip3 install --upgrade pip && pip3 install -r requirements.txt
20 |
21 | WORKDIR /root/cerberus
22 |
23 | ENTRYPOINT python3 start_cerberus.py --config=config/config.yaml
24 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | repos:
3 | - repo: https://github.com/Lucas-C/pre-commit-hooks
4 | rev: v1.1.1
5 | hooks:
6 | - id: remove-tabs
7 |
8 | - repo: https://github.com/pre-commit/pre-commit-hooks
9 | rev: v2.0.0
10 | hooks:
11 | - id: trailing-whitespace
12 | - id: check-merge-conflict
13 | - id: end-of-file-fixer
14 | - id: check-case-conflict
15 | - id: detect-private-key
16 | - id: check-ast
17 |
18 | - repo: https://github.com/psf/black
19 | rev: 22.3.0
20 | hooks:
21 | - id: black
22 | args: ['--line-length', '120']
23 |
24 | - repo: https://github.com/PyCQA/flake8
25 | rev: '3.7.8'
26 | hooks:
27 | - id: flake8
28 | additional_dependencies: ['pep8-naming']
29 | # Ignore all format-related checks as Black takes care of those.
30 | args: ['--ignore', 'E123,E125,W503', '--select', 'E,W,F', '--max-line-length=120']
31 |
--------------------------------------------------------------------------------
/docs/alerts.md:
--------------------------------------------------------------------------------
1 | # Alerts
2 |
3 | Cerberus consumes the metrics from Prometheus deployed on the cluster to report the alerts.
4 |
5 | When provided with the Prometheus URL and bearer token in the config, Cerberus reports the following alerts:
6 |
7 | - KubeAPILatencyHigh: alerts at the end of each iteration and warns if the 99th percentile latency for requests to the kube-apiserver is above 1 second. This is the official SLI/SLO defined for Kubernetes.
8 |
9 | - High number of etcd leader changes: alerts the user when an increase in etcd leader changes is observed on the cluster. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.
10 |
11 | **NOTE**: The Prometheus URL and bearer token are automatically picked up from the cluster if the distribution is OpenShift, since it is the default metrics solution. In the case of Kubernetes, they need to be provided in the config if Prometheus is deployed.
12 |
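13 | As a minimal sketch (not something Cerberus ships), the etcd alert can be reproduced with `prometheus_api_client`, the same library Cerberus uses; the URL and token below are placeholders, and the standard `etcd_server_leader_changes_seen_total` metric is assumed to be exposed:
14 |
15 | ```python
16 | import prometheus_api_client
17 |
18 | # Placeholders: point these at your cluster's Prometheus route and token.
19 | prom = prometheus_api_client.PrometheusConnect(
20 |     url="https://<prometheus-route>",
21 |     headers={"Authorization": "Bearer <token>"},
22 |     disable_ssl=True,
23 | )
24 | # Report if etcd observed any leader changes over the last hour.
25 | result = prom.custom_query(query="increase(etcd_server_leader_changes_seen_total[1h]) > 0")
26 | if result:
27 |     print("High number of etcd leader changes:", result)
28 | ```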
--------------------------------------------------------------------------------
/CI/tests/test_slack_integration.sh:
--------------------------------------------------------------------------------
1 | set -xeEo pipefail
2 |
3 | source CI/tests/common.sh
4 |
5 | trap error ERR
6 | trap finish EXIT
7 |
8 | function functional_test_slack_integration {
9 | if [[ `oc get ns test-namespace` ]]; then
10 | oc delete ns test-namespace
11 | fi
12 | oc create ns test-namespace
13 | sed -i '/watch_namespaces:/a\ - test-namespace\' config/config.yaml
14 | sed -i '/^\([[:space:]]*iterations: *\).*/s//\110/;/^\([[:space:]]*sleep_time: *\).*/s//\12/;/^\([[:space:]]*daemon_mode: *\).*/s//\1False/;' config/config.yaml
15 | day=$( date '+%A' )
16 | sed -i '/^\([[:space:]]*slack_integration: *\).*/s//\1True/;/^\([[:space:]]*'$day': *\).*/s//\1 AAAAAAAAA/;' config/config.yaml
17 | export -f create_and_delete_pod
18 | parallel ::: "python3 start_cerberus.py -c config/config.yaml" create_and_delete_pod
19 | oc delete ns test-namespace
20 | echo "${test_name} test: Success"
21 | }
22 |
23 | functional_test_slack_integration
24 |
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 | on:
3 | push:
4 | branches: ['main']
5 |
6 | jobs:
7 | build:
8 | runs-on: ubuntu-latest
9 | steps:
10 |       - name: Log in to quay
11 | run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN}
12 | env:
13 | QUAY_USER: ${{ secrets.QUAY_USERNAME }}
14 | QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }}
15 | - name: Check out code
16 | uses: actions/checkout@master
17 | - name: Build the Docker images
18 | run: docker build --no-cache -t quay.io/krkn-chaos/cerberus containers/
19 | - name: Push the Docker images
20 | run: docker push quay.io/krkn-chaos/cerberus
21 | - name: Remove Image
22 |         run: docker rmi -f quay.io/krkn-chaos/cerberus || true
23 | - name: Rebuild krkn-hub
24 | if: startsWith(github.ref, 'refs/tags')
25 | uses: redhat-chaos/actions/krkn-hub@main
26 | with:
27 | QUAY_USER: ${{ secrets.QUAY_USERNAME }}
28 | QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }}
29 | AUTOPUSH: ${{ secrets.AUTOPUSH }}
30 |
--------------------------------------------------------------------------------
/containers/build_own_image-README.md:
--------------------------------------------------------------------------------
1 | # Building your own Cerberus image
2 |
3 | 1. Git clone the Cerberus repository using `git clone https://github.com/redhat-chaos/cerberus.git`.
4 | 2. Modify the python code and yaml files to address your needs.
5 | 3. Execute `podman build -t <image_name>:latest .` in the containers directory within cerberus to build an image from the Dockerfile.
6 | 4. Execute `podman run --detach --name <container_name> <image_name>:latest` to start a container based on your new image.
7 |
8 | # Building the Cerberus image on IBM Power (ppc64le arch)
9 |
10 | 1. Git clone the Cerberus repository using `git clone https://github.com/redhat-chaos/cerberus.git` on an IBM Power Systems server.
11 | 2. Modify the python code and yaml files to address your needs.
12 | 3. Execute `podman build -t <image_name>:latest -f Dockerfile-ppc64le` in the containers directory within cerberus to build an image from the Dockerfile for Power.
13 | 4. Execute `podman run --detach --name <container_name> <image_name>:latest` to start a container based on your new image.
14 |
--------------------------------------------------------------------------------
/CI/tests/test_detailed_data_inspection.sh:
--------------------------------------------------------------------------------
1 | set -xeEo pipefail
2 |
3 | source CI/tests/common.sh
4 |
5 | trap error ERR
6 | trap finish EXIT
7 |
8 | function functional_test_detailed_data_inspection {
9 | if [[ `oc get ns test-namespace` ]]; then
10 | oc delete ns test-namespace
11 | fi
12 | oc create ns test-namespace
13 | sed -i '/watch_namespaces:/,/cerberus_publish_status/{//!d}; /watch_namespaces:/a\ - test-namespace\' config/config.yaml
14 | sed -i '/^\([[:space:]]*iterations: *\).*/s//\110/;/^\([[:space:]]*sleep_time: *\).*/s//\12/;/^\([[:space:]]*daemon_mode: *\).*/s//\1False/;/^\([[:space:]]*inspect_components: *\).*/s//\1True/;' config/config.yaml
15 | export -f create_and_delete_pod
16 | parallel ::: "python3 start_cerberus.py -c config/config.yaml" create_and_delete_pod
17 | oc delete ns test-namespace
18 | if [[ ! -d "inspect_data/test-namespace-logs" ]]
19 | then
20 | echo "${test_name} test: Fail"
21 | exit 1
22 | else
23 | echo "${test_name} test: Success"
24 | fi
25 | }
26 |
27 | functional_test_detailed_data_inspection
28 |
--------------------------------------------------------------------------------
/CI/run_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 |
4 | ci_dir=$1
5 | ci_test=`echo $1 | sed 's/-/_/g'`
6 |
7 | echo -e "\n======================================================================"
8 | echo -e " CI test for ${ci_test} "
9 | echo -e "======================================================================\n"
10 |
11 | cd $ci_dir
12 |
13 | start_time=`date`
14 |
15 | # Test ci
16 | if /bin/bash CI/tests/$ci_test.sh >> $ci_test.out 2>&1
17 | then
18 | # if the test passes update the results and complete
19 | end_time=`date`
20 | duration=`date -ud@$(($(date -ud"$end_time" +%s)-$(date -ud"$start_time" +%s))) +%T`
21 | echo "$ci_dir: Successful"
22 | echo "$ci_dir: Successful" > ci_results
23 | echo "$ci_test | Pass | $duration" > results.markdown
24 | count=$retries
25 | else
26 | end_time=`date`
27 | duration=`date -ud@$(($(date -ud"$end_time" +%s)-$(date -ud"$start_time" +%s))) +%T`
28 | echo "$ci_dir: Failed"
29 | echo "$ci_dir: Failed" > ci_results
30 | echo "$ci_test | Fail | $duration" > results.markdown
31 | echo "Logs for "$ci_dir
32 | # Display the error log since we have failed to pass
33 | cat $ci_test.out
34 | fi
35 |
--------------------------------------------------------------------------------
/cerberus/invoke/command.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import logging
3 | import sys
4 |
5 |
6 | # Invokes a given command and returns the stdout.
7 | # Stops Cerberus execution with exit code 1 if the command fails.
8 | def invoke(command, timeout=60):
9 |     output = ""
10 |     try:
11 |         output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout)
12 |     except Exception as e:
13 |         logging.error("Failed to run %s" % (command))
14 |         logging.error("Error: " + str(e))
15 |         sys.exit(1)
16 |     return output
17 |
18 |
19 | # Invokes a given command and returns the stdout.
20 | # In case of an exception, returns a message noting that the command could not be executed instead of stdout.
21 | # It won't stop Cerberus execution but doesn't guarantee that the command returns the expected stdout.
22 | def optional_invoke(command):
23 |     try:
24 |         optional_output = subprocess.check_output(command, shell=True, universal_newlines=True)
25 |     except Exception:
26 |         optional_output = "Result is absent."
27 |         logging.info(
28 |             "Optional command '%s' can't be executed, but it's not a problem at all. We can continue." % command
29 |         )
30 |
31 |     return optional_output
32 |
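33 | # Example usage, a minimal sketch (the commands are illustrative):
34 | #   import cerberus.invoke.command as runcommand
35 | #   nodes = runcommand.invoke("kubectl get nodes", timeout=30)  # exits Cerberus on failure
36 | #   kernel = runcommand.optional_invoke("uname -r")  # tolerates failure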
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = cerberus
3 | version = 1.0.1
4 | description = To guard the cluster
5 | author = chaitanyaenr
6 | author-email = nelluri@redhat.com
7 | license = Apache License 2.0
8 | long-description = file: README.md
9 | long-description-content-type = text/markdown; charset=UTF-8
10 | classifiers =
11 | Development Status :: 4 - Beta
12 | Programming Language :: Python
13 |
14 | [options]
15 | zip_safe = False
16 | packages = find:
17 | include_package_data = True
18 | package_dir =
19 | =cerberus
20 | # Add here dependencies of your project (semicolon/line-separated), e.g.
21 | install_requires = PyYAML
22 | # tests_require = pytest; pytest-cov
23 | # Require a specific Python version, e.g. Python 2.7 or >= 3.4
24 | # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
25 |
26 | [options.packages.find]
27 | where = src
28 |
29 | [options.extras_require]
30 | # Add here additional requirements for extra features, to install with:
31 | # `pip install cerberus[PDF]` like:
32 | # PDF = ReportLab; RXP
33 |
34 | [aliases]
35 | dists = bdist_wheel
36 |
37 | [bdist_wheel]
38 | # Use this option if your package is pure-python
39 | universal = 1
40 |
41 | [flake8]
42 | # Some sane defaults for the code style checker flake8
43 | exclude =
44 | .tox
45 | build
46 | dist
47 |     .eggs
48 |
--------------------------------------------------------------------------------
/containers/cerberus.yml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: cerberus-deployment
6 | spec:
7 | replicas: 1
8 | selector:
9 | matchLabels:
10 | tool: Cerberus
11 | template:
12 | metadata:
13 | labels:
14 | tool: Cerberus
15 | spec:
16 | serviceAccountName: useroot
17 | containers:
18 | - name: cerberus
19 | securityContext:
20 | privileged: true
21 | image: quay.io/redhat-chaos/cerberus
22 | command: ["/bin/sh", "-c"]
23 | args: ["python3 start_cerberus.py -c config/config.yaml"]
24 | ports:
25 | - containerPort: 8080
26 | volumeMounts:
27 | - mountPath: "/root/.kube"
28 | name: config
29 | - mountPath: "/root/cerberus/config"
30 | name: cerberus-config
31 | volumes:
32 | - name: config
33 | configMap:
34 | name: kube-config
35 | - name: cerberus-config
36 | configMap:
37 | name: cerberus-config
38 | ---
39 | apiVersion: v1
40 | kind: Service
41 | metadata:
42 | name: cerberus-service
43 | spec:
44 | type: NodePort
45 | selector:
46 | tool: Cerberus
47 | ports:
48 | - port: 8080
49 | targetPort: 8080
50 | nodePort: 30000
51 |
--------------------------------------------------------------------------------
/containers/Dockerfile-ppc64le:
--------------------------------------------------------------------------------
1 | # Dockerfile for cerberus for ppc64le arch
2 |
3 | FROM ppc64le/centos:8
4 |
5 | LABEL maintainer="Red Hat OpenShift Performance and Scale"
6 |
7 | ENV KUBECONFIG /root/.kube/config
8 |
9 | # Get kubectl and oc client for ppc64le arch
10 | RUN curl -L -o kubernetes-client-linux-ppc64le.tar.gz https://dl.k8s.io/v1.19.0/kubernetes-client-linux-ppc64le.tar.gz \
11 | && tar xf kubernetes-client-linux-ppc64le.tar.gz && mv kubernetes/client/bin/kubectl /usr/bin/ && rm -rf kubernetes-client-linux-ppc64le.tar.gz
12 |
13 | RUN curl -L -o openshift-client-linux.tar.gz https://mirror.openshift.com/pub/openshift-v4/ppc64le/clients/ocp/stable/openshift-client-linux.tar.gz \
14 | && tar xf openshift-client-linux.tar.gz -C /usr/bin && rm -rf openshift-client-linux.tar.gz
15 |
16 |
17 | # Install dependencies
18 | RUN yum install -y git python36 python3-pip gcc libffi-devel python36-devel openssl-devel gcc-c++ make && \
19 | pip3 install cython && \
20 | pip3 install numpy && \
21 | git clone https://github.com/krkn-chaos/cerberus.git --branch v1.0.4 /root/cerberus && \
22 | mkdir -p /root/.kube && cd /root/cerberus && \
23 | pip3 install -r requirements.txt && \
24 | pip3 install setuptools==40.3.0 && \
25 | pip3 install urllib3==1.25.4
26 |
27 | WORKDIR /root/cerberus
28 |
29 | ENTRYPOINT python3 start_cerberus.py --config=config/config.yaml
30 |
--------------------------------------------------------------------------------
/cerberus/inspect/inspect.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 |
4 | import cerberus.invoke.command as runcommand
5 |
6 |
7 | # 'inspect_data' directory is used to collect logs, events and metrics of
8 | # the failed component. Delete 'inspect_data' directory if it exists.
9 | def delete_inspect_directory():
10 | if os.path.isdir("inspect_data/"):
11 | logging.info("Deleting existing inspect_data directory")
12 | runcommand.invoke("rm -R inspect_data")
13 | runcommand.invoke("mkdir inspect_data")
14 |
15 |
16 | def inspect_component(namespace):
17 | dir_name = "inspect_data/" + namespace + "-logs"
18 | if os.path.isdir(dir_name):
19 | runcommand.invoke("rm -R " + dir_name)
20 | logging.info("Deleted existing %s directory" % (dir_name))
21 | runcommand.invoke("oc adm inspect ns/" + namespace + " --dest-dir=" + dir_name + " | tr -d '\n'")
22 | logging.info("Inspecting namespace %s into %s" % (namespace, dir_name))
23 |
24 |
25 | def inspect_operator(operator):
26 | dir_name = "inspect_data/" + operator + "-logs.out"
27 | if os.path.isdir(dir_name):
28 | runcommand.invoke("rm -R " + dir_name)
29 | logging.info("Deleted existing %s directory" % (dir_name))
30 | runcommand.invoke("oc adm inspect clusteroperator/" + operator + " --dest-dir=" + dir_name)
31 | logging.info("Inspecting cluster operator %s into %s" % (operator, dir_name))
32 |
--------------------------------------------------------------------------------
/docs/node-problem-detector.md:
--------------------------------------------------------------------------------
1 | # Node Problem Detector
2 | [node-problem-detector](https://github.com/kubernetes/node-problem-detector) aims to make various node problems visible to the upstream layers in the cluster management stack.
3 |
4 | ### Installation
5 | Please follow the instructions in the [installation](https://github.com/kubernetes/node-problem-detector#installation) section to set up Node Problem Detector on Kubernetes. The following instructions set it up on OpenShift:
6 |
7 | 1. Create `openshift-node-problem-detector` namespace [ns.yaml](https://github.com/openshift/node-problem-detector-operator/blob/master/deploy/ns.yaml) with `oc create -f ns.yaml`
8 | 2. Add cluster role with `oc adm policy add-cluster-role-to-user system:node-problem-detector -z default -n openshift-node-problem-detector`
9 | 3. Add security context constraints with `oc adm policy add-scc-to-user privileged system:serviceaccount:openshift-node-problem-detector:default`
10 | 4. Edit [node-problem-detector.yaml](https://github.com/kubernetes/node-problem-detector/blob/master/deployment/node-problem-detector.yaml) to fit your environment.
11 | 5. Edit [node-problem-detector-config.yaml](https://github.com/kubernetes/node-problem-detector/blob/master/deployment/node-problem-detector-config.yaml) to configure node-problem-detector.
12 | 6. Create the ConfigMap with `oc create -f node-problem-detector-config.yaml`
13 | 7. Create the DaemonSet with `oc create -f node-problem-detector.yaml`
14 |
15 | Once installed, you will see node-problem-detector pods in the openshift-node-problem-detector namespace.
16 | Now enable openshift-node-problem-detector in the [config.yaml](https://github.com/openshift-scale/cerberus/blob/master/config/config.yaml).
17 | Cerberus monitors just the `KernelDeadlock` condition provided by the node problem detector, as it is system critical and can hinder node performance.
19 |
--------------------------------------------------------------------------------
/docs/contribute.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | Contributions are always appreciated.
4 |
5 | How to:
6 | * [Submit Pull Request](#pull-request)
7 | * [Fix Formatting](#fix-formatting)
8 | * [Squash Commits](#squash-commits)
9 |
10 | ## Pull request
11 |
12 | In order to submit a change or a PR, please fork the project and follow the instructions:
13 | ```bash
14 | $ git clone http://github.com/<your_username>/cerberus
15 | $ cd cerberus
16 | $ git checkout -b <branch_name>
17 | $ <make your changes>
18 | $ git add <changed_files>
19 | $ git commit -a
20 | $ <add a descriptive commit message>
21 | $ git push
22 | ```
23 |
24 | ## Fix Formatting
25 | Cerberus uses the [pre-commit](https://pre-commit.com) framework to maintain code linting and python code styling.
26 | The CI runs the pre-commit check on each pull request.
27 | We encourage our contributors to follow the same pattern while contributing to the code.
28 |
29 | The pre-commit configuration file is present in the repository at `.pre-commit-config.yaml`.
30 | It contains the different code styling and linting guides which we use for the application.
31 |
32 | The following command can be used to run pre-commit:
33 | `pre-commit run --all-files`
34 |
35 | If pre-commit is not installed on your system, it can be installed with: `pip install pre-commit`
36 |
37 | ## Squash Commits
38 | If there are multiple commits, please rebase/squash them into fewer commits
39 | before creating the PR by following:
40 |
41 | ```bash
42 | $ git checkout <branch_name>
43 | $ git rebase -i HEAD~<number_of_commits>
44 | -OR-
45 | $ git rebase -i <commit_hash>
46 | ```
46 | ```
47 |
48 | In the interactive rebase screen, set the first commit to `pick` and all others to `squash` (or whatever else you may need to do).
49 |
50 | Push your rebased commits (you may need to force), then issue your PR.
51 |
52 | ```bash
53 | $ git push origin --force
54 | ```
55 |
--------------------------------------------------------------------------------
/cerberus/prometheus/client.py:
--------------------------------------------------------------------------------
1 | import urllib3
2 | import logging
3 | import prometheus_api_client
4 | import cerberus.invoke.command as runcommand
5 |
6 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
7 |
8 |
9 | # Initialize the client
10 | def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token):
11 | global prom_cli
12 | if distribution == "openshift" and not prometheus_url:
13 | url = runcommand.invoke(
14 | r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'""" # noqa
15 | )
16 | prometheus_url = "https://" + url
17 | if distribution == "openshift" and not prometheus_bearer_token:
18 | prometheus_bearer_token = runcommand.invoke(
19 | "oc create token -n openshift-monitoring prometheus-k8s --duration=12h "
20 | "|| oc sa get-token -n openshift-monitoring prometheus-k8s || "
21 | "oc sa new-token -n openshift-monitoring prometheus-k8s"
22 | )
23 | if prometheus_url and prometheus_bearer_token:
24 | bearer = "Bearer " + prometheus_bearer_token
25 | headers = {"Authorization": bearer}
26 | try:
27 | prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True)
28 | except Exception as e:
29 | logging.error("Not able to initialize the client %s" % e)
30 | else:
31 | prom_cli = None
32 |
33 |
34 | # Process custom prometheus query
35 | def process_prom_query(query):
36 | if prom_cli:
37 | try:
38 | return prom_cli.custom_query(query=query, params=None)
39 | except Exception as e:
40 | logging.error("Failed to get the metrics: %s" % e)
41 | else:
42 |         logging.info("Skipping the prometheus query as the prometheus client couldn't be initialized\n")
43 |
--------------------------------------------------------------------------------
/containers/README.md:
--------------------------------------------------------------------------------
1 | ### Cerberus image
2 |
3 | The container image is built automatically by quay.io at [Cerberus image](https://quay.io/repository/redhat-chaos/cerberus). Builds are triggered by any commit pushed to this repository.
4 |
5 | ### Run containerized version
6 | Refer to the [instructions](https://github.com/cloud-bulldozer/cerberus/tree/master/containers/build_own_image-README.md) for information on how to build and run the containerized version of cerberus.
7 |
8 | ### Cerberus as a Kubernetes/OpenShift application
9 | To run containerized Cerberus as a Kubernetes/OpenShift Deployment, follow these steps:
10 | 1. Configure the [config.yaml](https://github.com/openshift-scale/cerberus/tree/master/config) file according to your requirements.
11 | 2. Create a namespace under which you want to run the cerberus pod using `kubectl create ns <namespace>`.
12 | 3. Switch to the `<namespace>` namespace:
13 |     - In Kubernetes, use `kubectl config set-context --current --namespace=<namespace>`
14 |     - In OpenShift, use `oc project <namespace>`
15 | 4. Create a ConfigMap named kube-config using `kubectl create configmap kube-config --from-file=<path_to_kubeconfig>`
16 | 5. Create a ConfigMap named cerberus-config using `kubectl create configmap cerberus-config --from-file=<path_to_cerberus_config>`
17 | 6. Create a serviceaccount to run the cerberus pod with privileges using `kubectl create serviceaccount useroot`.
18 |     - In OpenShift, execute `oc adm policy add-scc-to-user privileged -z useroot`.
19 | 7. Create a Deployment and a NodePort Service using `kubectl apply -f cerberus.yml`
20 | 8. Accessing the go/no-go signal:
21 |     - In Kubernetes, execute `kubectl port-forward --address 0.0.0.0 pod/<cerberus_pod_name> 8080:8080` and access the signal at `http://localhost:8080` and `http://<host_machine_IP>:8080`.
22 |     - In OpenShift, create a route based on the cerberus-service using `oc expose service cerberus-service`. List all the routes using `oc get routes`. Use the HOST/PORT associated with cerberus-service to access the signal.
23 |
24 | NOTE: It is not recommended to run Cerberus inside the cluster it is monitoring, as the pod running Cerberus might itself get disrupted.
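25 |
26 | Once the port-forward from step 8 is active, the signal can also be consumed programmatically. A minimal sketch using only the Python standard library:
27 |
28 | ```python
29 | import urllib.request
30 |
31 | # "/" returns the raw go/no-go signal published by Cerberus (True or False).
32 | with urllib.request.urlopen("http://localhost:8080/") as resp:
33 |     print(resp.read().decode())
34 | ```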
25 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Code Quality Check and CI tests
2 |
3 | on:
4 | - push
5 | - pull_request
6 |
7 | env:
8 | COVERAGE_FILE: "${pwd}/.coverage"
9 |
10 | jobs:
11 | ci:
12 | runs-on: ubuntu-latest
13 | name: Run pre-commit, install test and CI tests
14 | steps:
15 | - name: Check out source repository
16 | uses: actions/checkout@v2
17 | - name: Install Python
18 | uses: actions/setup-python@v4
19 | with:
20 | python-version: '3.9'
21 | architecture: 'x64'
22 | - name: Install environment
23 | run: |
24 | pip install --upgrade pip
25 | pip install -r requirements.txt
26 | - name: Create multi-node KinD cluster
27 | uses: redhat-chaos/actions/kind@main
28 | # In CI we need to use daemon disabled or cerberus will never exit unless killed.
29 |       - name: Run functional test basic run with daemon disabled
30 | run: |
31 | cp ./config/kubernetes_config.yaml ./daemon_disabled_config.yaml
32 | sed -i "s/daemon_mode: True/daemon_mode: False/" ./daemon_disabled_config.yaml
33 | python3 -m coverage run -a start_cerberus.py -c ./daemon_disabled_config.yaml
34 | - name: Run functional test detailed data inspection with daemon disabled
35 | run: |
36 | cp ./daemon_disabled_config.yaml ./detailed_config.yaml
37 | sed -i "s/kube-system/test-namespace/" ./detailed_config.yaml
38 | sed -i "s/inspect_components: False/inspect_components: True/" ./detailed_config.yaml
39 | kubectl create ns test-namespace
40 | source CI/tests/common.sh
41 | export -f create_and_delete_pod
42 | parallel ::: "python3 start_cerberus.py -c ./detailed_config.yaml" create_and_delete_pod
43 | kubectl delete ns test-namespace
44 | - name: Publish coverage report to job summary
45 | run: |
46 | python3 -m coverage html
47 | pip install html2text
48 | html2text --ignore-images --ignore-links -b 0 htmlcov/index.html >> $GITHUB_STEP_SUMMARY
49 | - name: Upload coverage data
50 | uses: actions/upload-artifact@v4
51 | with:
52 | name: coverage
53 | path: htmlcov
54 | if-no-files-found: error
55 |
--------------------------------------------------------------------------------
/docs/slack.md:
--------------------------------------------------------------------------------
1 | # Slack Integration
2 |
3 | The user has the option to enable/disable the slack integration (disabled by default). To use the slack integration, the user first has to create an [app](https://api.slack.com/apps?new_granular_bot_app=1) on slack and add a bot to it. The SLACK_API_TOKEN and SLACK_CHANNEL environment variables have to be set: SLACK_API_TOKEN refers to the Bot User OAuth Access Token, and SLACK_CHANNEL refers to the ID of the slack channel where the user wishes to receive the notifications. Make sure the Slack Bot Token Scopes contain these permissions: [calls:read] [channels:read] [chat:write] [groups:read] [im:read] [mpim:read]
4 | - Reports when cerberus starts monitoring a cluster in the specified slack channel.
5 | - Reports the component failures in the slack channel.
6 | - A watcher can be assigned for each day of the week. The watcher of the day is tagged while reporting failures in the slack channel instead of everyone. (NOTE: Defining the watcher id's is optional and when the watcher slack id's are not defined, the slack_team_alias tag is used if it is set else no tag is used while reporting failures in the slack channel.)
7 |
8 | #### Go or no-go signal
9 | When cerberus is configured to run in daemon mode, it continuously monitors the specified components, runs a simple http server at http://0.0.0.0:8080 and publishes the signal, i.e. True or False depending on the status of the components. Other tools can consume the signal and act accordingly.
10 |
11 | #### Failures in a time window
12 | 1. The failures in the past 1 hour can be retrieved in JSON format by visiting http://0.0.0.0:8080/history.
13 | 2. The failures in a specific time window can be retrieved in JSON format by visiting http://0.0.0.0:8080/history?loopback=<duration in minutes>.
14 | 3. The failures between two timestamps, the failures of specific issue types and the failures related to specific components can be retrieved in JSON format by visiting the http://0.0.0.0:8080/analyze URL. The filters have to be applied to scrape the failures accordingly (see the sketch at the end of this page).
15 |
16 | #### Sample Slack Config
17 |
18 | This is a snippet of how your slack config could look within your `cerberus_config.yaml`.
19 |
20 | ```yaml
21 | watcher_slack_ID:
22 | Monday: U1234ABCD # replace with your Slack ID from Profile-> More -> Copy Member ID
23 | Tuesday: # Same or different ID can be used for remaining days depending on who you want to tag
24 | Wednesday:
25 | Thursday:
26 | Friday:
27 | Saturday:
28 | Sunday:
29 | slack_team_alias: @group_or_team_id
30 | ```
31 |
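32 | #### Consuming the failure endpoints
33 |
34 | A minimal sketch (standard library only) of consuming the failure endpoints described above; the dates and filter values are placeholders, and the `/analysis` endpoint shown here is the JSON endpoint behind the `/analyze` page:
35 |
36 | ```python
37 | import json
38 | import urllib.request
39 |
40 | # Failures observed in the last 30 minutes ("loopback" is in minutes).
41 | with urllib.request.urlopen("http://0.0.0.0:8080/history?loopback=30") as resp:
42 |     history = json.load(resp)
43 |
44 | # Failures filtered by a time window; the issue, name and component
45 | # parameters are expected even when left blank.
46 | query = "sdate=2020-03-26&stime=22:00:00&fdate=2020-03-26&ftime=23:59:59&issue=&name=&component="
47 | with urllib.request.urlopen("http://0.0.0.0:8080/analysis?" + query) as resp:
48 |     analysis = json.load(resp)
49 |
50 | print(history, analysis)
51 | ```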
--------------------------------------------------------------------------------
/CI/run_ci.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | test_rc=0
4 | diff_list=`git diff --name-only origin/master`
5 | echo -e "List of files changed : ${diff_list} \n"
6 |
7 | test_list=`cat CI/tests/test_list`
8 |
9 | echo "running test suite consisting of ${test_list}"
10 |
11 | sed 's/.sh//g' CI/tests/test_list > CI/tests/my_tests
12 |
13 | # Prep the results.markdown file
14 | echo 'Test | Result | Duration' >> results.markdown
15 | echo '-----------------------|--------|---------' >> results.markdown
16 |
17 | # Create a "gold" directory based off the current branch
18 | rsync -av --progress `pwd`/ `pwd`/gold
19 |
20 | # Create individual directories for each test
21 | for ci_dir in `cat CI/tests/my_tests`
22 | do
23 | rsync -av --progress `pwd`/gold/ `pwd`/$ci_dir
24 | done
25 |
26 | ./CI/master_test.sh
27 |
28 | # Run each test
29 | for test_name in `cat CI/tests/my_tests`
30 | do
31 | ./CI/run_test.sh $test_name
32 | done
33 |
34 | # Update markdown file
35 | for test_dir in `cat CI/tests/my_tests`
36 | do
37 | cat $test_dir/results.markdown >> results.markdown
38 | cat $test_dir/ci_results >> ci_results
39 | done
40 |
41 | if [[ -d "test_daemon_disabled" ]]; then
42 | echo "" >> results.markdown
43 | echo 'Check | Gold time (s) | PR time (s) | % Change' >> results.markdown
44 | echo '------|---------------|-------------|---------' >> results.markdown
45 | checks_in_pr=`jq '.Average | keys | .[]' test_daemon_disabled/time_tracker.json`
46 | checks_in_master=`jq '.Average | keys | .[]' tmp/master_time_tracker.json`
47 | for check in $checks_in_pr; do
48 | pr_time=$(jq -r ".Average[$check]" test_daemon_disabled/time_tracker.json);
49 | if [[ `echo $checks_in_master | grep -w $check` ]];
50 | then
51 | gold_time=$(jq -r ".Average[$check]" tmp/master_time_tracker.json);
52 | delta=$(bc -l <<<"scale=2; (${pr_time}-${gold_time})/(${gold_time}*0.01)")
53 | gold_time=$(bc -l <<<"scale=6; ${gold_time}/1")
54 | pr_time=$(bc -l <<<"scale=6; ${pr_time}/1")
55 | echo "$check | $gold_time | $pr_time | $delta" >> results.markdown
56 | else
57 | pr_time=$(bc -l <<<"scale=6; ${pr_time}/1")
58 | echo "$check | | $pr_time | " >> results.markdown
59 | fi
60 | done
61 | fi
62 |
63 | # Get number of successes/failures
64 | testcount=`wc -l ci_results`
65 | success=`grep Successful ci_results | awk -F ":" '{print $1}'`
66 | failed=`grep Failed ci_results | awk -F ":" '{print $1}'`
67 | failcount=`grep -c Failed ci_results`
68 |
69 | if [ `grep -c Failed ci_results` -gt 0 ]
70 | then
71 | test_rc=1
72 | fi
73 |
74 | # Clean up our created directories
75 | rm -rf gold test_* ci_results
76 |
77 | cat results.markdown
78 |
79 | exit $test_rc
80 |
--------------------------------------------------------------------------------
/docs/example_report.md:
--------------------------------------------------------------------------------
1 | # Example Report
2 |
3 | ```
4 | 2020-03-26 22:05:06,393 [INFO] Starting ceberus
5 | 2020-03-26 22:05:06,401 [INFO] Initializing client to talk to the Kubernetes cluster
6 | 2020-03-26 22:05:06,434 [INFO] Fetching cluster info
7 | 2020-03-26 22:05:06,739 [INFO] Publishing cerberus status at http://0.0.0.0:8080
8 | 2020-03-26 22:05:06,753 [INFO] Starting http server at http://0.0.0.0:8080
9 | 2020-03-26 22:05:06,753 [INFO] Daemon mode enabled, cerberus will monitor forever
10 | 2020-03-26 22:05:06,753 [INFO] Ignoring the iterations set
11 |
12 | 2020-03-26 22:05:25,104 [INFO] Iteration 4: Node status: True
13 | 2020-03-26 22:05:25,133 [INFO] Iteration 4: Etcd member pods status: True
14 | 2020-03-26 22:05:25,161 [INFO] Iteration 4: OpenShift apiserver status: True
15 | 2020-03-26 22:05:25,546 [INFO] Iteration 4: Kube ApiServer status: True
16 | 2020-03-26 22:05:25,717 [INFO] Iteration 4: Monitoring stack status: True
17 | 2020-03-26 22:05:25,720 [INFO] Iteration 4: Kube controller status: True
18 | 2020-03-26 22:05:25,746 [INFO] Iteration 4: Machine API components status: True
19 | 2020-03-26 22:05:25,945 [INFO] Iteration 4: Kube scheduler status: True
20 | 2020-03-26 22:05:25,963 [INFO] Iteration 4: OpenShift ingress status: True
21 | 2020-03-26 22:05:26,077 [INFO] Iteration 4: OpenShift SDN status: True
22 | 2020-03-26 22:05:26,077 [INFO] HTTP requests served: 0
23 | 2020-03-26 22:05:26,077 [INFO] Sleeping for the specified duration: 5
24 |
25 |
26 | 2020-03-26 22:05:31,134 [INFO] Iteration 5: Node status: True
27 | 2020-03-26 22:05:31,162 [INFO] Iteration 5: Etcd member pods status: True
28 | 2020-03-26 22:05:31,190 [INFO] Iteration 5: OpenShift apiserver status: True
29 | 127.0.0.1 - - [26/Mar/2020 22:05:31] "GET / HTTP/1.1" 200 -
30 | 2020-03-26 22:05:31,588 [INFO] Iteration 5: Kube ApiServer status: True
31 | 2020-03-26 22:05:31,759 [INFO] Iteration 5: Monitoring stack status: True
32 | 2020-03-26 22:05:31,763 [INFO] Iteration 5: Kube controller status: True
33 | 2020-03-26 22:05:31,788 [INFO] Iteration 5: Machine API components status: True
34 | 2020-03-26 22:05:31,989 [INFO] Iteration 5: Kube scheduler status: True
35 | 2020-03-26 22:05:32,007 [INFO] Iteration 5: OpenShift ingress status: True
36 | 2020-03-26 22:05:32,118 [INFO] Iteration 5: OpenShift SDN status: False
37 | 2020-03-26 22:05:32,118 [INFO] HTTP requests served: 1
38 | 2020-03-26 22:05:32,118 [INFO] Sleeping for the specified duration: 5
39 | +--------------------------------------------------Failed Components--------------------------------------------------+
40 | 2020-03-26 22:05:37,123 [INFO] Failed openshfit sdn components: ['sdn-xmqhd']
41 |
42 | 2020-05-23 23:26:43,041 [INFO] ------------------------- Iteration Stats ---------------------------------------------
43 | 2020-05-23 23:26:43,041 [INFO] Time taken to run watch_nodes in iteration 1: 0.0996248722076416 seconds
44 | 2020-05-23 23:26:43,041 [INFO] Time taken to run watch_cluster_operators in iteration 1: 0.3672499656677246 seconds
45 | 2020-05-23 23:26:43,041 [INFO] Time taken to run watch_namespaces in iteration 1: 1.085144281387329 seconds
46 | 2020-05-23 23:26:43,041 [INFO] Time taken to run entire_iteration in iteration 1: 4.107403039932251 seconds
47 | 2020-05-23 23:26:43,041 [INFO] ---------------------------------------------------------------------------------------
48 | ```
49 |
--------------------------------------------------------------------------------
/cerberus/server/server.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import logging
3 | import _thread
4 | import cerberus.database.client as dbcli
5 | from urllib.parse import urlparse, parse_qsl
6 | from http.server import HTTPServer, BaseHTTPRequestHandler
7 |
8 |
9 | # Start a simple http server to publish the cerberus status file content
10 | class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
11 | requests_served = 0
12 |
13 | def do_GET(self):
14 | if self.path == "/":
15 | self.do_status()
16 | elif self.path.startswith("/history"):
17 | self.do_history()
18 | elif self.path == "/analyze":
19 | self.do_analyze()
20 | elif self.path.startswith("/analysis"):
21 | self.do_analysis()
22 |
23 | def do_status(self):
24 | self.send_response(200)
25 | self.end_headers()
26 | f = open("/tmp/cerberus_status", "rb")
27 | self.wfile.write(f.read())
28 | SimpleHTTPRequestHandler.requests_served = SimpleHTTPRequestHandler.requests_served + 1
29 |
30 | def do_history(self):
31 | parsed = urlparse(self.path)
32 | try:
33 | loopback = int(float(dict(parse_qsl(parsed.query))["loopback"]) * 60)
34 | except Exception:
35 | loopback = 3600
36 | try:
37 | dbcli.query(loopback)
38 | self.send_response(200)
39 | self.end_headers()
40 | f = open("./history/cerberus_history.json", "rb")
41 | self.wfile.write(f.read())
42 | except Exception as e:
43 | self.send_error(404, "Encountered the following error: %s. Please retry" % e)
44 |
45 | def do_analyze(self):
46 | try:
47 | self.send_response(200)
48 | self.end_headers()
49 | f = open("./history/analysis.html", "rb")
50 | self.wfile.write(f.read())
51 | except Exception as e:
52 | self.send_error(404, "Encountered the following error: %s. Please retry" % e)
53 |
54 | def do_analysis(self):
55 | formdata = dict(parse_qsl(urlparse(self.path).query, keep_blank_values=True))
56 | for key in ["issue", "name", "component"]:
57 | formdata[key] = formdata[key].strip().split(",")
58 | if not formdata[key]:
59 | formdata[key] = ()
60 | else:
61 | formdata[key] = tuple(value.strip() for value in formdata[key] if value.strip())
62 | try:
63 | dbcli.custom_query(formdata)
64 | self.send_response(200)
65 | self.end_headers()
66 | f = open("./history/cerberus_analysis.json", "rb")
67 | self.wfile.write(f.read())
68 | except Exception as e:
69 | self.send_error(404, "Encountered the following error: %s. Please retry" % e)
70 |
71 |
72 | def start_server(address):
73 | server = address[0]
74 | port = address[1]
75 | httpd = HTTPServer(address, SimpleHTTPRequestHandler)
76 | logging.info("Starting http server at http://%s:%s\n" % (server, port))
77 | try:
78 | _thread.start_new_thread(httpd.serve_forever, ())
79 | except Exception:
80 |         logging.error("Failed to start the http server at http://%s:%s" % (server, port))
81 |         sys.exit(1)
82 |
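83 | # Example usage, a minimal sketch: publish the status on all interfaces.
84 | # The status file /tmp/cerberus_status must exist before "/" is requested.
85 | #   start_server(("0.0.0.0", 8080))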
--------------------------------------------------------------------------------
/cerberus/slack/slack_client.py:
--------------------------------------------------------------------------------
1 | import os
2 | import slack_sdk
3 | import logging
4 | import datetime
5 | import cerberus.invoke.command as runcommand
6 |
7 |
8 | # Load env variables and initialize slack python client
9 | def initialize_slack_client():
10 | try:
11 | global slack_reporter_token, slack_channel_ID, slack_client
12 | slack_reporter_token = os.environ["SLACK_API_TOKEN"]
13 | slack_channel_ID = os.environ["SLACK_CHANNEL"]
14 | slack_client = slack_sdk.WebClient(token=slack_reporter_token)
15 | logging.info("Slack integration has been enabled")
16 | return True
17 | except Exception as e:
18 |         logging.error("Couldn't create slack WebClient. Check if slack env variables are set. Exception: %s" % (e))
19 | logging.info("Slack integration has been disabled")
20 | return False
21 |
22 |
23 | # Post messages and failures in slack
24 | def post_message_in_slack(slack_message, thread_ts=None):
25 | slack_client.chat_postMessage(channel=slack_channel_ID, link_names=True, text=slack_message, thread_ts=thread_ts)
26 |
27 |
28 | # Get members of a channel
29 | def get_channel_members():
30 | return slack_client.conversations_members(channel=slack_channel_ID)
31 |
32 |
33 | # Slack tag to be used while reporting in the slack channel
34 | def slack_tagging(watcher_slack_member_ID, slack_team_alias):
35 | global slack_tag, valid_watchers
36 | valid_watchers = get_channel_members()["members"]
37 | if watcher_slack_member_ID in valid_watchers:
38 | slack_tag = "<@" + watcher_slack_member_ID + ">"
39 | elif slack_team_alias:
40 | slack_tag = "@" + slack_team_alias + " "
41 | else:
42 | slack_tag = ""
43 |
44 |
45 | # Report the start of cerberus cluster monitoring in slack channel
46 | def slack_report_cerberus_start(cluster_info, weekday, watcher_slack_member_ID):
47 | response = slack_client.chat_postMessage(
48 | channel=slack_channel_ID,
49 | link_names=True,
50 | text="%s Cerberus has started monitoring! " ":skull_and_crossbones: %s" % (slack_tag, cluster_info),
51 | ) # noqa
52 | global thread_ts
53 | thread_ts = response["ts"]
54 | if watcher_slack_member_ID in valid_watchers:
55 | post_message_in_slack("Hi " + slack_tag + "! The watcher for " + weekday + "!\n", thread_ts)
56 |
57 |
58 | # Report the nodes and namespace failures in slack channel
59 | def slack_logging(
60 | cluster_info,
61 | iteration,
62 | watch_nodes_status,
63 | failed_nodes,
64 | watch_cluster_operators_status,
65 | failed_operators,
66 | watch_namespaces_status,
67 | failed_pods_components,
68 | custom_checks_status,
69 | custom_checks_fail_messages,
70 | ):
71 | issues = []
72 | cerberus_report_path = runcommand.invoke("pwd | tr -d '\n'")
73 | if not watch_nodes_status:
74 | issues.append("*nodes: " + ", ".join(failed_nodes) + "*")
75 | if not watch_cluster_operators_status:
76 | issues.append("*cluster operators: " + ", ".join(failed_operators) + "*")
77 | if not watch_namespaces_status:
78 | issues.append("*namespaces: " + ", ".join(list(failed_pods_components.keys())) + "*")
79 | if not custom_checks_status:
80 | issues.append("*custom_checks: " + ", ".join(custom_checks_fail_messages) + "*")
81 | issues = "\n".join(issues)
82 | post_message_in_slack(
83 | slack_tag + " %sIn iteration %d at %s, Cerberus "
84 | "found issues in: \n%s \nHence, setting the "
85 | "go/no-go signal to false. \nThe full report "
86 | "is at *%s* on the host cerberus is running."
87 | % (
88 | cluster_info,
89 | iteration,
90 | datetime.datetime.now().replace(microsecond=0).isoformat(),
91 | issues,
92 | cerberus_report_path,
93 | ),
94 | thread_ts,
95 | )
96 |
--------------------------------------------------------------------------------
/config/kubernetes_config.yaml:
--------------------------------------------------------------------------------
1 | cerberus:
2 | distribution: kubernetes # Distribution can be kubernetes or openshift
3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig
4 | watch_nodes: True # Set to True for the cerberus to monitor the cluster nodes
5 | watch_cluster_operators: False # Set to True for cerberus to monitor cluster operators. Enable it only when distribution is openshift
6 |   watch_url_routes: # Route URLs you want to monitor; this is a double array with the URL and an optional authorization parameter
7 | watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label.
8 | enabled: True
9 | label: node-role.kubernetes.io/control-plane
10 | watch_namespaces: # List of namespaces to be monitored
11 | - kube-system
12 | watch_namespaces_ignore_pattern: [] # Ignores pods matching the regex pattern in the namespaces specified under watch_namespaces
13 |   cerberus_publish_status: True # When enabled, cerberus starts a lightweight http server and publishes the status
14 | inspect_components: False # Enable it only when OpenShift client is supported to run
15 | # When enabled, cerberus collects logs, events and metrics of failed components
16 |
17 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
18 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
19 | # This enables Cerberus to query prometheus and alert on observing high Kube API Server latencies.
20 |
21 | slack_integration: False # When enabled, cerberus reports the failed iterations in the slack channel
22 |   # The following env vars need to be set: SLACK_API_TOKEN ( Bot User OAuth Access Token ) and SLACK_CHANNEL ( channel to send notifications in case of failures )
23 | # When slack_integration is enabled, a watcher can be assigned for each day. The watcher of the day is tagged while reporting failures in the slack channel. Values are slack member ID's.
24 | watcher_slack_ID: # (NOTE: Defining the watcher id's is optional and when the watcher slack id's are not defined, the slack_team_alias tag is used if it is set else no tag is used while reporting failures in the slack channel.)
25 | Monday:
26 | Tuesday:
27 | Wednesday:
28 | Thursday:
29 | Friday:
30 | Saturday:
31 | Sunday:
32 | slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned
33 |
34 | tunings:
35 | timeout: 60 # Number of seconds before requests fail
36 | iterations: 5 # Iterations to loop before stopping the watch, it will be replaced with infinity when the daemon mode is enabled
37 | sleep_time: 60 # Sleep duration between each iteration
38 | daemon_mode: True # Iterations are set to infinity which means that the cerberus will monitor the resources forever
39 |
40 | database:
41 | database_path: /tmp/cerberus.db # Path where cerberus database needs to be stored
42 | reuse_database: False # When enabled, the database is reused to store the failures
43 |
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 |
3 | The following ways are supported to run Cerberus:
4 |
5 | - Standalone python program through Git or python package
6 | - Containerized version using either Podman or Docker as the runtime
7 | - Kubernetes or OpenShift deployment
8 |
9 | **NOTE**: Only OpenShift 4.x versions are tested.
10 |
11 |
12 | ## Git
13 | Pick the latest stable release to install [here](https://github.com/redhat-chaos/cerberus/releases/).
14 | ```
15 | $ git clone https://github.com/redhat-chaos/cerberus.git --branch <release_version>
16 | ```
17 |
18 | ### Install the dependencies
19 | **NOTE**: It is recommended to use a virtual environment (pyenv, venv) to prevent conflicts with already installed packages.
20 | ```
21 | $ pip3 install -r requirements.txt
22 | ```
23 |
24 | ### Configure and Run
25 | Setup the [config](https://github.com/redhat-chaos/cerberus/tree/master/config) according to your requirements. Information on the available options can be found at [usage](usage.md).
26 |
27 | #### Run
28 | ```
29 | $ python3 start_cerberus.py --config <config_file_location>
30 | ```
31 |
32 | **NOTE**: When config file location is not passed, default [config](https://github.com/redhat-chaos/cerberus/tree/master/config) is used.
33 |
34 |
35 | ## Python Package
36 | Cerberus is also available as a python package to ease the installation and setup.
37 |
38 | To install the latest release:
39 |
40 | ```
41 | $ pip3 install cerberus-client
42 | ```
43 |
44 | ### Configure and Run
45 | Setup the [config](https://github.com/redhat-chaos/cerberus/tree/master/config) according to your requirements. Information on the available options can be found at [usage](usage.md).
46 |
47 | #### Run
48 | ```
49 | $ cerberus_client -c <config_file_location>
50 | ```
51 |
52 | **NOTE**: When config_file_location is not passed, default [config](https://github.com/redhat-chaos/cerberus/tree/master/config) is used.
53 | **NOTE**: It's recommended to run Cerberus using either the containerized or the GitHub version to be able to use the latest enhancements and fixes.
54 |
55 | ## Containerized version
56 |
57 | Assuming docker ( 17.05 or greater with multi-build support ) is installed on the host, run:
58 | ```
59 | $ docker pull quay.io/redhat-chaos/cerberus
60 | # Setup the [config](https://github.com/redhat-chaos/cerberus/tree/master/config) according to your requirements. Information on the available options can be found at [usage](usage.md).
61 | $ docker run --name=cerberus --net=host -v <path_to_kubeconfig>:/root/.kube/config -v <path_to_cerberus_config>:/root/cerberus/config/config.yaml -d quay.io/redhat-chaos/cerberus:latest
62 | $ docker logs -f cerberus
63 | ```
64 |
65 | Similarly, podman can be used to achieve the same:
66 | ```
67 | $ podman pull quay.io/redhat-chaos/cerberus
68 | # Setup the [config](https://github.com/redhat-chaos/cerberus/tree/master/config) according to your requirements. Information on the available options can be found at [usage](usage.md).
69 | $ podman run --name=cerberus --net=host -v <path_to_kubeconfig>:/root/.kube/config:Z -v <path_to_cerberus_config>:/root/cerberus/config/config.yaml:Z -d quay.io/redhat-chaos/cerberus:latest
70 | $ podman logs -f cerberus
71 | ```
72 |
73 | The go/no-go signal ( True or False ) gets published at http://`<hostname>`:8080. Note that cerberus will only support IPv4 for the time being.
74 |
75 | **NOTE**: The report is generated at /root/cerberus/cerberus.report inside the container; it can be mounted to a directory on the host in case you want to capture it.
76 |
77 | If you want to build your own Cerberus image, see [here](https://github.com/redhat-chaos/cerberus/tree/master/containers/build_own_image-README.md).
78 | To run Cerberus on Power (ppc64le) architecture, build and run a containerized version by following the instructions given [here](https://github.com/cloud-bulldozer/cerberus/tree/master/containers/build_own_image-README.md).
79 |
80 | ## Run containerized Cerberus as a Kubernetes/OpenShift deployment
81 | Refer to the [instructions](https://github.com/redhat-chaos/cerberus/blob/master/containers/README.md#cerberus-as-a-kubernetesopenshift-application) for information on how to run cerberus as a Kubernetes or OpenShift application.
82 |
--------------------------------------------------------------------------------
/cerberus/database/client.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import json
4 | import sqlite3
5 | from datetime import datetime
6 | import cerberus.invoke.command as runcommand
7 |
8 |
9 | def get_time(timestamp):
10 | return int(time.mktime(datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S").timetuple()))
11 |
12 |
13 | def set_db_path(database_path):
14 | global db_path
15 | db_path = database_path
16 |
17 |
18 | def create_db():
19 | if os.path.isfile(db_path):
20 | runcommand.invoke("rm " + db_path)
21 | sqlite3.connect(db_path)
22 |
23 |
24 | def create_table():
25 | connection = sqlite3.connect(db_path)
26 | crsr = connection.cursor()
27 | command = """create table Failures (
28 | timestamp timestamp,
29 | time integer,
30 | count integer,
31 | issue text,
32 | name text,
33 | component text);"""
34 | crsr.execute(command)
35 | connection.commit()
36 |
37 |
38 | def insert(timestamp, time, count, issue, names, component):
39 | connection = sqlite3.connect(db_path)
40 | crsr = connection.cursor()
41 | timestamp = timestamp.replace(microsecond=0)
42 | time = int(time)
43 | for name in names:
44 | crsr.execute("insert into Failures values (?, ?, ?, ?, ?, ?)", (timestamp, time, count, issue, name, component))
45 | connection.commit()
46 |
47 |
48 | def query(loopback):
49 | connection = sqlite3.connect(db_path)
50 | crsr = connection.cursor()
51 | finish_time = int(time.time())
52 | start_time = finish_time - loopback
53 | command = "select timestamp, count, issue, name, component from Failures where " "time >= " + str(
54 | start_time
55 | ) + " and time <= " + str(finish_time)
56 | crsr.execute(command)
57 | fetched_data = crsr.fetchall()
58 | create_json(fetched_data, "cerberus_history.json")
59 |
60 |
61 | def custom_query(filters):
62 | connection = sqlite3.connect(db_path)
63 | crsr = connection.cursor()
64 | start_time = ""
65 | finish_time = ""
66 | sdate = filters.get("sdate", "")
67 | stime = filters.get("stime", "")
68 | fdate = filters.get("fdate", "")
69 | ftime = filters.get("ftime", "")
70 | issue = filters.get("issue", ())
71 | name = filters.get("name", ())
72 | component = filters.get("component", ())
73 |
74 | if sdate and not stime:
75 | stime = "00:00:00"
76 | if fdate and not ftime:
77 | ftime = "23:59:59"
78 |
79 | if sdate:
80 | start_time = sdate + " " + stime
81 | start_time = get_time(start_time)
82 | if fdate:
83 | finish_time = fdate + " " + ftime
84 | finish_time = get_time(finish_time)
85 |
86 | command = "select timestamp, count, issue, name, component from Failures where "
87 |
88 | if start_time and finish_time:
89 | command += "time >= " + str(start_time) + " and time <= " + str(finish_time) + " and "
90 | elif start_time:
91 | command += "time >= " + str(start_time) + " and "
92 | elif finish_time:
93 | command += "time <= " + str(finish_time) + " and "
94 |
95 | if issue:
96 | command += "issue in " + str(issue + ("",)) + " and "
97 | if name:
98 | command += "name in " + str(name + ("",)) + " and "
99 | if component:
100 | command += "component in " + str(component + ("",)) + " and "
101 |
102 | command = command.strip().rsplit(" ", 1)[0]
103 |
104 | crsr.execute(command)
105 | fetched_data = crsr.fetchall()
106 |
107 | create_json(fetched_data, "cerberus_analysis.json")
108 |
109 |
110 | def create_json(fetched_data, file_name):
111 | failures = []
112 | for data in fetched_data:
113 | failure = {
114 | "timestamp": data[0],
115 | "count": data[1],
116 | "issue": data[2],
117 | "name": data[3],
118 | "component": data[4],
119 | }
120 | failures.append(failure)
121 |
122 | history = {"history": {"failures": failures}}
123 |
124 | with open("./history/" + file_name, "w+") as file:
125 | json.dump(history, file, indent=4, separators=(",", ": "))
126 |
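127 | # Example usage, a minimal sketch of the expected call order (assumes the
128 | # ./history directory exists for the generated JSON files):
129 | #   set_db_path("/tmp/cerberus.db")
130 | #   create_db()
131 | #   create_table()
132 | #   insert(datetime.now(), time.time(), 1, "failed pods", ["pod-a"], "kube-system")
133 | #   query(3600)  # last hour of failures -> ./history/cerberus_history.json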
--------------------------------------------------------------------------------
/config/config.yaml:
--------------------------------------------------------------------------------
1 | cerberus:
2 | distribution: openshift # Distribution can be kubernetes or openshift
3 | kubeconfig_path: ~/.kube/config # Path to kubeconfig
4 | port: 8080 # http server port where cerberus status is published
5 | watch_nodes: True # Set to True for the cerberus to monitor the cluster nodes
6 | watch_cluster_operators: True # Set to True for cerberus to monitor cluster operators
7 |   watch_url_routes: # Route URLs you want to monitor; this is a double array with the URL and an optional authorization parameter
8 | watch_terminating_namespaces: True # Set to True to monitor if any namespaces in the 'watch_namespaces' list start terminating
9 | watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label.
10 | enabled: True
11 | label: node-role.kubernetes.io/master
12 | watch_namespaces: # List of namespaces to be monitored
13 | - openshift-etcd
14 | - openshift-apiserver
15 | - openshift-kube-apiserver
16 | - openshift-monitoring
17 | - openshift-kube-controller-manager
18 | - openshift-machine-api
19 | - openshift-kube-scheduler
20 | - openshift-ingress
21 | - openshift-ovn-kubernetes # When enabled, it will check for the cluster sdn and monitor that namespace
22 | watch_namespaces_ignore_pattern: [^installer*] # Ignores pods matching the regex pattern in the namespaces specified under watch_namespaces
23 |   cerberus_publish_status: True # When enabled, cerberus starts a lightweight http server and publishes the status
24 | inspect_components: False # Enable it only when OpenShift client is supported to run
25 | # When enabled, cerberus collects logs, events and metrics of failed components
26 |
27 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
28 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
29 | # This enables Cerberus to query prometheus and alert on observing high Kube API Server latencies.
30 |
31 | slack_integration: False # When enabled, cerberus reports the failed iterations in the slack channel
32 | # The following env vars need to be set: SLACK_API_TOKEN ( Bot User OAuth Access Token ) and SLACK_CHANNEL ( channel to send notifications in case of failures )
33 | # When slack_integration is enabled, a watcher can be assigned for each day. The watcher of the day is tagged while reporting failures in the slack channel. Values are slack member ID's.
34 | watcher_slack_ID: # (NOTE: Defining the watcher IDs is optional; when they are not defined, the slack_team_alias tag is used if it is set, else no tag is used while reporting failures in the slack channel.)
35 | Monday:
36 | Tuesday:
37 | Wednesday:
38 | Thursday:
39 | Friday:
40 | Saturday:
41 | Sunday:
42 | slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned
43 |
44 | custom_checks: # Relative paths of files containing additional user defined checks
45 |
46 | tunings:
47 | timeout: 60 # Number of seconds before requests fail
48 | iterations: 5 # Iterations to loop before stopping the watch; it will be replaced with infinity when the daemon mode is enabled
49 | sleep_time: 60 # Sleep duration between each iteration
50 | kube_api_request_chunk_size: 250 # Large requests will be broken into the specified chunk size to reduce the load on API server and improve responsiveness.
51 | daemon_mode: True # Iterations are set to infinity which means that the cerberus will monitor the resources forever
52 | cores_usage_percentage: 0.5 # Set the fraction of cores to be used for multiprocessing
53 |
54 | database:
55 | database_path: /tmp/cerberus.db # Path where cerberus database needs to be stored
56 | reuse_database: False # When enabled, the database is reused to store the failures
57 |
--------------------------------------------------------------------------------
/docs/usage.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
3 | ### Config
4 | Set the supported components to monitor and the tunings, like the number of iterations and the duration to wait between each check, in the config file located at config/config.yaml. A sample config looks like:
5 |
6 | ```
7 | cerberus:
8 | distribution: openshift # Distribution can be kubernetes or openshift
9 | kubeconfig_path: ~/.kube/config # Path to kubeconfig
10 | port: 8080 # http server port where cerberus status is published
11 | watch_nodes: True # Set to True for the cerberus to monitor the cluster nodes
12 | watch_cluster_operators: True # Set to True for cerberus to monitor cluster operators. This parameter is optional and defaults to True if not specified
13 | watch_url_routes: # Route URLs you want to monitor
14 | - - https://...
15 |       - Bearer **** # This parameter is optional; specify the authorization needed for the GET call to the route
16 | - - http://...
17 | watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label.
18 |     enabled: True
19 | label: node-role.kubernetes.io/master
20 | watch_namespaces: # List of namespaces to be monitored
21 | - openshift-etcd
22 | - openshift-apiserver
23 | - openshift-kube-apiserver
24 | - openshift-monitoring
25 | - openshift-kube-controller-manager
26 | - openshift-machine-api
27 | - openshift-kube-scheduler
28 | - openshift-ingress
29 | - openshift-sdn
30 | cerberus_publish_status: True # When enabled, cerberus starts a lightweight http server and publishes the status
31 | inspect_components: False # Enable only when the OpenShift client is available to run.
32 | # When enabled, cerberus collects logs, events and metrics of failed components
33 |
34 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
35 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
36 | # This enables Cerberus to query prometheus and alert on observing high Kube API Server latencies.
37 |
38 | slack_integration: False # When enabled, cerberus reports status of failed iterations in the slack channel
39 | # The following env vars need to be set: SLACK_API_TOKEN ( Bot User OAuth Access Token ) and SLACK_CHANNEL ( channel to send notifications in case of failures )
40 | # When slack_integration is enabled, a watcher can be assigned for each day. The watcher of the day is tagged while reporting failures in the slack channel. Values are slack member ID's.
41 | watcher_slack_ID: # (NOTE: Defining the watcher IDs is optional; when they are not defined, the slack_team_alias tag is used if it is set, else no tag is used while reporting failures in the slack channel.)
42 | Monday:
43 | Tuesday:
44 | Wednesday:
45 | Thursday:
46 | Friday:
47 | Saturday:
48 | Sunday:
49 | slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned
50 |
51 | custom_checks: # Relative paths of files containing additional user defined checks
52 | - custom_checks/custom_check_sample.py
53 | - custom_check.py
54 |
55 | tunings:
56 | iterations: 5 # Iterations to loop before stopping the watch; it will be replaced with infinity when the daemon mode is enabled
57 | sleep_time: 60 # Sleep duration between each iteration
58 | kube_api_request_chunk_size: 250 # Large requests will be broken into the specified chunk size to reduce the load on API server and improve responsiveness.
59 | daemon_mode: True # Iterations are set to infinity which means that the cerberus will monitor the resources forever
60 | cores_usage_percentage: 0.5 # Set the fraction of cores to be used for multiprocessing
61 |
62 | database:
63 | database_path: /tmp/cerberus.db # Path where cerberus database needs to be stored
64 | reuse_database: False # When enabled, the database is reused to store the failures
65 | ```
66 | **NOTE**: watch_namespaces supports regex patterns. Any valid regex pattern can be used to watch all the namespaces matching it. For example, `^openshift-.*$` can be used to watch all namespaces that start with `openshift-`, or `openshift` can be used to watch all namespaces that contain `openshift` in their name.
67 |
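As a rough illustration of how such a pattern is applied (a minimal sketch using Python's `re` module; this is not the exact matching code Cerberus uses):

```
import re

pattern = "^openshift-.*$"
namespaces = ["openshift-etcd", "default", "openshift-monitoring"]

# Keep only the namespaces whose names match the configured pattern
watched = [ns for ns in namespaces if re.search(pattern, ns)]
print(watched)  # ['openshift-etcd', 'openshift-monitoring']
```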
68 | **NOTE**: The current implementation can monitor only one cluster from one host. It can be used to monitor multiple clusters provided multiple instances of Cerberus are launched on different hosts.
69 |
70 | **NOTE**: The components, especially the namespaces, need to be changed depending on the distribution, i.e. Kubernetes or OpenShift. The defaults specified in the config assume that the distribution is OpenShift. A config file for Kubernetes is located at config/kubernetes_config.yaml
71 |
--------------------------------------------------------------------------------
/media/logo_assets/all_black/cerberus-logo_color-black-full-stacked.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/media/logo_assets/all_black/cerberus-logo_color-black-full-horizontal.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/media/logo_assets/all_white/cerberus-logo_color-white-full-stacked.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/media/logo_assets/all_white/cerberus-logo_color-black-full-horiszontal.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/media/logo_assets/full_color/over_dark_background/cerberus-logo_color-dark-full-stacked.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/media/logo_assets/full_color/over_light_background/cerberus-logo_color-light-full-stacked.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/media/logo_assets/full_color/over_dark_background/cerberus-logo_color-dark-full-horizontal.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/media/logo_assets/full_color/over_light_background/cerberus-logo_color-light-full-horizontal.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cerberus
2 | Guardian of Kubernetes and OpenShift Clusters
3 |
4 | 
5 |
6 | Cerberus watches Kubernetes/OpenShift clusters for dead nodes and system component failures/health, and exposes a go or no-go signal which can be consumed by other workload generators or applications in the cluster to act accordingly.
7 |
8 | You can find more information about Cerberus, how to run it and its config options at https://krkn-chaos.dev/docs/cerberus/
9 |
10 | ### Workflow
11 | 
12 |
13 |
14 | ### Installation
15 | Instructions on how to setup, configure and run Cerberus can be found at [Installation](docs/installation.md).
16 |
17 |
18 |
19 | ### What Kubernetes/OpenShift components can Cerberus monitor?
20 | Following are the components of Kubernetes/OpenShift that Cerberus can monitor today; we will be adding more soon.
21 |
22 | Component | Description | Working
23 | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------- | ------------------------- |
24 | Nodes | Watches all the nodes including masters, workers as well as nodes created using custom MachineSets | :heavy_check_mark: |
25 | Namespaces | Watches all the pods including containers running inside the pods in the namespaces specified in the config | :heavy_check_mark: |
26 | Cluster Operators | Watches all Cluster Operators | :heavy_check_mark: |
27 | Masters Schedulability              | Watches and warns if master nodes are marked as schedulable                                                        | :heavy_check_mark:        |
28 | Routes | Watches specified routes | :heavy_check_mark: |
29 | CSRs | Warns if any CSRs are not approved | :heavy_check_mark: |
30 | Critical Alerts                     | Warns the user on observing abnormal behavior which might affect the health of the cluster                         | :heavy_check_mark:        |
31 | Bring your own checks               | Users can bring their own checks and Cerberus runs and includes them in the reporting as well as the go/no-go signal | :heavy_check_mark:      |
32 |
33 | All the components that Cerberus can monitor are explained [here](docs/config.md)
34 |
35 | ### How does Cerberus report cluster health?
36 | Cerberus exposes the cluster health and failures through a go/no-go signal, report and metrics API.
37 |
38 | #### Go or no-go signal
39 | When cerberus is configured to run in the daemon mode, it will continuously monitor the specified components, run a lightweight http server at http://0.0.0.0:8080 and publish the signal, i.e. True or False, depending on the status of the components. The tools can consume the signal and act accordingly.
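For example, a consumer can poll the signal with a plain HTTP GET (a minimal sketch; the response body is the bare signal value):

```
$ curl http://0.0.0.0:8080
True
```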
40 |
41 | #### Report
42 | The report is generated in the run directory and it contains the information about each check/monitored component status per iteration with timestamps. It also displays information about the components in case of failure. Refer [report](docs/example_report.md) for example.
43 |
44 | You can use the "-o " option to change the location of the created report
45 |
46 | #### Metrics API
47 | Cerberus exposes the metrics including the failures observed during the run through an API. Tools consuming Cerberus can query the API to get a blob of json with the observed failures to scrape and act accordingly. For example, we can query for etcd failures within a start and end time and take actions to determine pass/fail for test cases or report whether the cluster is healthy or unhealthy for that duration.
48 |
49 | - The failures in the past 1 hour can be retrieved in the json format by visiting http://0.0.0.0:8080/history.
50 | - The failures in a specific time window can be retrieved in the json format by visiting http://0.0.0.0:8080/history?loopback=.
51 | - The failures between two timestamps, the failures of specific issue types and the failures related to specific components can be retrieved in the json format by visiting the http://0.0.0.0:8080/analyze url. The filters have to be applied to scrape the failures accordingly, as sketched below.
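A rough sketch of querying the history endpoint and the shape of the returned json (the fields mirror the failure records Cerberus stores; the values shown are illustrative):

```
$ curl "http://0.0.0.0:8080/history"
{
    "history": {
        "failures": [
            {
                "timestamp": "...",
                "count": 1,
                "issue": "...",
                "name": "...",
                "component": "..."
            }
        ]
    }
}
```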
52 |
53 |
54 |
55 | ### Slack integration
56 | Cerberus supports reporting failures in slack. Refer [slack integration](docs/slack.md) for information on how to set it up.
57 |
58 |
59 |
60 | ### Node Problem Detector
61 | Cerberus also consumes [node-problem-detector](https://github.com/kubernetes/node-problem-detector) to detect various failures in Kubernetes/OpenShift nodes. More information on setting it up can be found at [node-problem-detector](docs/node-problem-detector.md)
62 |
63 |
64 |
65 | ### Bring your own checks
66 | Users can add additional checks to monitor components that are not monitored by Cerberus and consume them as part of the go/no-go signal. This can be accomplished by placing the relative paths of the files containing the additional checks under custom_checks in the config file. All the checks should be placed within the main function of the file. If the additional checks need to be considered in determining the go/no-go signal of Cerberus, the main function can return a boolean value for the same. Returning a dict of the format {'status': status, 'message': message} sends the signal to Cerberus along with a message to be displayed in the slack notification. However, returning a value is optional. A minimal sketch is shown below.
67 | Refer to [example_check](https://github.com/openshift-scale/cerberus/blob/master/custom_checks/custom_check_sample.py) for an example custom check file.
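A minimal sketch of such a check (the probing logic is hypothetical; only the return contract follows the description above):

```
# my_custom_check.py - a user-defined check listed under custom_checks in the config
def main():
    healthy = True  # replace with real probing logic for the component being checked
    if healthy:
        return {"status": True, "message": "custom check passed"}
    return {"status": False, "message": "custom check failed"}
```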
68 |
69 |
70 | ### Alerts
71 | Monitoring metrics and alerting on abnormal behavior is critical as they are the indicators of cluster health. Information on supported alerts can be found at [alerts](docs/alerts.md).
72 |
73 |
74 |
75 | ### Use cases
76 | There can be a number of use cases; here are some of them:
77 | - We run tools to push the limits of Kubernetes/OpenShift to look at the performance and scalability. There are a number of instances where system components or nodes start to degrade, which invalidates the results and the workload generator continues to push the cluster until it is unrecoverable.
78 |
79 | - When running chaos experiments on a Kubernetes/OpenShift cluster, they can potentially break components unrelated to the targeted ones, and the chaos experiment alone won't be able to detect that. The go/no-go signal can be used here to decide whether the cluster recovered from the failure injection as well as whether to continue with the next chaos scenario.
80 |
81 |
82 |
83 | ### Tools consuming Cerberus
84 | - [Benchmark Operator](https://github.com/cloud-bulldozer/benchmark-operator): The intent of this Operator is to deploy common workloads to establish a performance baseline of Kubernetes cluster on your provider. Benchmark Operator consumes Cerberus to determine if the cluster was healthy during the benchmark run. More information can be found at [cerberus-integration](https://github.com/cloud-bulldozer/benchmark-operator#cerberus-integration).
85 |
86 | - [Kraken](https://github.com/openshift-scale/kraken/): Tool to inject deliberate failures into Kubernetes/OpenShift clusters to check if they are resilient. Kraken consumes Cerberus to determine if the cluster is healthy as a whole, in addition to the targeted component, during chaos testing. More information can be found at [cerberus-integration](https://github.com/openshift-scale/kraken#kraken-scenario-passfail-criteria-and-report).
87 |
88 |
89 |
90 | ### Blogs and other useful resources
91 | - https://www.openshift.com/blog/openshift-scale-ci-part-4-introduction-to-cerberus-guardian-of-kubernetes/openshift-clouds
92 | - https://www.openshift.com/blog/reinforcing-cerberus-guardian-of-openshift/kubernetes-clusters
93 |
94 |
95 |
96 | ### Contributions
97 | We are always looking for more enhancements and fixes to make it better; any contributions are most welcome. Feel free to report or work on the issues filed on GitHub.
98 |
99 | [More information on how to Contribute](docs/contribute.md)
100 |
101 | ### Community
102 | Key Members (slack_usernames): paige, rook, mffiedler, mohit, dry923, rsevilla, ravi
103 | * [**#sig-scalability on Kubernetes Slack**](https://kubernetes.slack.com)
104 | * [**#forum-perfscale on CoreOS Slack**](https://coreos.slack.com)
105 |
106 |
107 |
108 | ### Credits
109 | Thanks to Mary Shakshober ( https://github.com/maryshak1996 ) for designing the logo.
110 |
--------------------------------------------------------------------------------
/docs/config.md:
--------------------------------------------------------------------------------
1 | Cerberus Config Components Explained
2 |
3 | * [Sample Config](#config)
4 | * [Watch Nodes](#watch-nodes)
5 | * [Watch Operators](#watch-cluster-operators)
6 | * [Watch Routes](#watch-routes)
7 | * [Watch Master Schedulable Status](#watch-master-schedulable-status)
8 | * [Watch Namespaces](#watch-namespaces)
9 | * [Watch Terminating Namespaces](#watch-terminating-namespaces)
10 | * [Publish Status](#publish-status)
11 | * [Inspect Components](#inspect-components)
12 | * [Custom Checks](#custom-checks)
13 |
14 | ### Config
15 | Set the components to monitor and the tunings, like the duration to wait between each check, in the config file located at config/config.yaml. A sample config looks like:
16 |
17 | ```
18 | cerberus:
19 | distribution: openshift # Distribution can be kubernetes or openshift
20 | kubeconfig_path: /root/.kube/config # Path to kubeconfig
21 | port: 8081 # http server port where cerberus status is published
22 | watch_nodes: True # Set to True for the cerberus to monitor the cluster nodes
23 | watch_cluster_operators: True # Set to True for cerberus to monitor cluster operators
24 | watch_terminating_namespaces: True # Set to True to monitor if any namespaces (set below under 'watch_namespaces') start terminating
25 | watch_url_routes:
26 | # Route URLs you want to monitor; this is a double array with the URL and an optional authorization parameter
27 | watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label.
28 | enabled: True
29 | label: node-role.kubernetes.io/master
30 | watch_namespaces: # List of namespaces to be monitored
31 | - openshift-etcd
32 | - openshift-apiserver
33 | - openshift-kube-apiserver
34 | - openshift-monitoring
35 | - openshift-kube-controller-manager
36 | - openshift-machine-api
37 | - openshift-kube-scheduler
38 | - openshift-ingress
39 | - openshift-sdn # When enabled, it will check for the cluster sdn and monitor that namespace
40 | watch_namespaces_ignore_pattern: [] # Ignores pods matching the regex pattern in the namespaces specified under watch_namespaces
41 | cerberus_publish_status: True # When enabled, cerberus starts a lightweight http server and publishes the status
42 | inspect_components: False # Enable only when the OpenShift client is available to run
43 | # When enabled, cerberus collects logs, events and metrics of failed components
44 |
45 | prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
46 | prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
47 | # This enables Cerberus to query prometheus and alert on observing high Kube API Server latencies.
48 |
49 | slack_integration: False # When enabled, cerberus reports the failed iterations in the slack channel
50 | # The following env vars need to be set: SLACK_API_TOKEN ( Bot User OAuth Access Token ) and SLACK_CHANNEL ( channel to send notifications in case of failures )
51 | # When slack_integration is enabled, a watcher can be assigned for each day. The watcher of the day is tagged while reporting failures in the slack channel. Values are slack member ID's.
52 | watcher_slack_ID: # (NOTE: Defining the watcher IDs is optional; when they are not defined, the slack_team_alias tag is used if it is set, else no tag is used while reporting failures in the slack channel.)
53 | Monday:
54 | Tuesday:
55 | Wednesday:
56 | Thursday:
57 | Friday:
58 | Saturday:
59 | Sunday:
60 | slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned
61 |
62 | custom_checks:
63 | - custom_checks/custom_check_sample.py # Relative paths of files containing additional user defined checks
64 |
65 | tunings:
66 | timeout: 20 # Number of seconds before requests fail
67 | iterations: 1 # Iterations to loop before stopping the watch; it will be replaced with infinity when the daemon mode is enabled
68 | sleep_time: 3 # Sleep duration between each iteration
69 | kube_api_request_chunk_size: 250 # Large requests will be broken into the specified chunk size to reduce the load on API server and improve responsiveness.
70 | daemon_mode: True # Iterations are set to infinity which means that the cerberus will monitor the resources forever
71 | cores_usage_percentage: 0.5 # Set the fraction of cores to be used for multiprocessing
72 |
73 | database:
74 | database_path: /tmp/cerberus.db # Path where cerberus database needs to be stored
75 | reuse_database: False # When enabled, the database is reused to store the failures
76 | ```
77 |
78 | #### Watch Nodes
79 | This flag reports any nodes where the KernelDeadlock condition is not set to False and that do not have a `Ready` status
80 |
81 | #### Watch Cluster Operators
82 | When `watch_cluster_operators` is set to True, this will monitor the degraded status of all the cluster operators and report a failure if any are degraded.
83 | If set to False, Cerberus will not query or report the status of the cluster operators.
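For reference, the same degraded status can be inspected manually with the OpenShift client (illustrative output; operator names, versions and timings vary per cluster):

```
$ oc get clusteroperators
NAME             VERSION   AVAILABLE   PROGRESSING   DEGRADED   SINCE
authentication   4.10.0    True        False         False      3h
etcd             4.10.0    True        False         False      3h
```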
84 |
85 |
86 | #### Watch Routes
87 | This parameter expects a double array in which each item holds a url and an optional bearer token or authorization needed to properly connect to that url
88 |
89 | For example:
90 | ```
91 | watch_url_routes:
92 | - -
93 | - (optional)
94 | - - https://prometheus-k8s-openshift-monitoring.apps.****.devcluster.openshift.com
95 | - Bearer ****
96 | - - http://nodejs-mongodb-example-default.apps.****.devcluster.openshift.com
97 |
98 | ```
99 |
100 | #### Watch Master Schedulable Status
101 | When this check is enabled, cerberus queries each of the nodes for the given label and verifies the taint effect does not equal "NoSchedule"
102 | ```
103 | watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label.
104 | enabled: True
105 | label: