├── .codecov.yml ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ ├── ci.yaml │ ├── generate.yaml │ └── helmrelease.yaml ├── .gitignore ├── .golangci.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── cmd └── sloth │ ├── commands │ ├── commands.go │ ├── generate.go │ ├── helpers.go │ ├── k8scontroller.go │ ├── validate.go │ └── version.go │ └── main.go ├── deploy └── kubernetes │ ├── helm │ └── sloth │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── crds │ │ └── sloth.slok.dev_prometheusservicelevels.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── cluster-role-binding.yaml │ │ ├── cluster-role.yaml │ │ ├── configmap.yaml │ │ ├── deployment.yaml │ │ ├── pod-monitor.yaml │ │ └── service-account.yaml │ │ ├── tests │ │ ├── go.mod │ │ ├── go.sum │ │ ├── helm_chart_test.go │ │ ├── testdata │ │ │ └── output │ │ │ │ ├── cluster_role_binding_custom.yaml │ │ │ │ ├── cluster_role_binding_default.yaml │ │ │ │ ├── cluster_role_custom.yaml │ │ │ │ ├── cluster_role_default.yaml │ │ │ │ ├── configmap_slo_config.yaml │ │ │ │ ├── deployment_custom.yaml │ │ │ │ ├── deployment_custom_no_extras.yaml │ │ │ │ ├── deployment_custom_slo_config.yaml │ │ │ │ ├── deployment_default.yaml │ │ │ │ ├── pod_monitor_custom.yaml │ │ │ │ ├── pod_monitor_default.yaml │ │ │ │ ├── sa_custom.yaml │ │ │ │ └── sa_default.yaml │ │ └── values_test.go │ │ └── values.yaml │ ├── kustomization.yaml │ └── raw │ ├── sloth-with-common-plugins.yaml │ └── sloth.yaml ├── docker ├── dev │ └── Dockerfile └── prod │ └── Dockerfile ├── docs └── img │ ├── logo.png │ └── sloth_small_dashboard.png ├── examples ├── _gen │ ├── custom_rule_group_interval.yml │ ├── getting-started.yml │ ├── home-wifi.yml │ ├── k8s-getting-started.yml │ ├── k8s-home-wifi.yml │ ├── k8s-multifile.yml │ ├── kubernetes-apiserver.yml │ ├── multifile.yml │ ├── no-alerts.yml │ ├── openslo-getting-started.yml │ ├── openslo-kubernetes-apiserver.yml │ ├── plugin-getting-started.yml │ ├── plugin-k8s-getting-started.yml │ └── raw-home-wifi.yml ├── custom_rule_group_interval.yml ├── getting-started.yml ├── home-wifi.yml ├── k8s-getting-started.yml ├── k8s-home-wifi.yml ├── k8s-multifile.yml ├── kubernetes-apiserver.yml ├── multifile.yml ├── no-alerts.yml ├── openslo-getting-started.yml ├── openslo-kubernetes-apiserver.yml ├── plugin-getting-started.yml ├── plugin-k8s-getting-started.yml ├── plugins │ └── getting-started │ │ └── availability │ │ └── plugin.go ├── raw-home-wifi.yml └── windows │ ├── 7d.yaml │ └── custom-30d.yaml ├── go.mod ├── go.sum ├── internal ├── alert │ ├── alert.go │ ├── alert_test.go │ ├── window.go │ └── windows │ │ ├── google-28d.yaml │ │ └── google-30d.yaml ├── app │ ├── generate │ │ ├── noop.go │ │ ├── prometheus.go │ │ └── prometheus_test.go │ └── kubecontroller │ │ ├── handler.go │ │ └── retriever.go ├── info │ └── info.go ├── k8sprometheus │ ├── helpers.go │ ├── k8sprometheusmock │ │ └── prometheus_rules_ensurer.go │ ├── kubernetes.go │ ├── model.go │ ├── model_test.go │ ├── spec.go │ ├── spec_test.go │ ├── storage.go │ └── storage_test.go ├── log │ ├── log.go │ └── logrus │ │ └── logrus.go ├── openslo │ ├── spec.go │ └── spec_test.go └── prometheus │ ├── alert_rules.go │ ├── alert_rules_test.go │ ├── conventions.go │ ├── helpers.go │ ├── model.go │ ├── model_test.go │ ├── prometheusmock │ └── file_manager.go │ ├── recording_rules.go │ ├── recording_rules_test.go │ ├── sli_plugin.go │ ├── sli_plugin_test.go │ ├── spec.go │ ├── spec_test.go │ ├── storage.go │ └── storage_test.go ├── pkg ├── kubernetes │ ├── 
api │ │ └── sloth │ │ │ ├── register.go │ │ │ └── v1 │ │ │ ├── README.md │ │ │ ├── doc.go │ │ │ ├── register.go │ │ │ ├── types.go │ │ │ └── zz_generated.deepcopy.go │ └── gen │ │ ├── clientset │ │ └── versioned │ │ │ ├── clientset.go │ │ │ ├── doc.go │ │ │ ├── fake │ │ │ ├── clientset_generated.go │ │ │ ├── doc.go │ │ │ └── register.go │ │ │ ├── scheme │ │ │ ├── doc.go │ │ │ └── register.go │ │ │ └── typed │ │ │ └── sloth │ │ │ └── v1 │ │ │ ├── doc.go │ │ │ ├── fake │ │ │ ├── doc.go │ │ │ ├── fake_prometheusservicelevel.go │ │ │ └── fake_sloth_client.go │ │ │ ├── generated_expansion.go │ │ │ ├── prometheusservicelevel.go │ │ │ └── sloth_client.go │ │ └── crd │ │ └── sloth.slok.dev_prometheusservicelevels.yaml └── prometheus │ ├── alertwindows │ └── v1 │ │ ├── README.md │ │ └── v1.go │ ├── api │ └── v1 │ │ ├── README.md │ │ └── v1.go │ └── plugin │ └── v1 │ └── v1.go ├── scripts ├── build │ ├── bin │ │ ├── build-all.sh │ │ ├── build-raw.sh │ │ └── build.sh │ └── docker │ │ ├── build-image-dev.sh │ │ ├── build-image.sh │ │ ├── build-publish-image-all.sh │ │ └── publish-image.sh ├── check │ ├── check.sh │ ├── helm-test.sh │ ├── integration-test-cli.sh │ ├── integration-test-k8s.sh │ ├── integration-test.sh │ └── unit-test.sh ├── deploygen.sh ├── deps.sh ├── examplesgen.sh ├── gogen.sh └── kubegen.sh └── test └── integration ├── crd └── prometheus-operator-crd.yaml ├── k8scontroller ├── exp_base_28d_test.go ├── exp_base_7d_test.go ├── exp_base_test.go ├── exp_plugin_test.go ├── helpers.go ├── k8scontroller_test.go ├── plugin │ └── plugin.go └── windows │ └── 7d.yaml ├── prometheus ├── generate_test.go ├── helpers.go ├── plugin │ └── plugin.go ├── testdata │ ├── in-base-k8s.yaml │ ├── in-base.yaml │ ├── in-invalid-version.yaml │ ├── in-multifile-k8s.yaml │ ├── in-multifile.yaml │ ├── in-openslo.yaml │ ├── in-plugin.yaml │ ├── out-base-28d.yaml.tpl │ ├── out-base-custom-windows-7d.yaml.tpl │ ├── out-base-extra-labels.yaml.tpl │ ├── out-base-k8s.yaml.tpl │ ├── out-base-no-alerts.yaml.tpl │ ├── out-base-no-recordings.yaml.tpl │ ├── out-base.yaml.tpl │ ├── out-multifile-k8s.yaml.tpl │ ├── out-multifile.yaml.tpl │ ├── out-openslo.yaml.tpl │ ├── out-plugin.yaml.tpl │ └── validate │ │ ├── bad │ │ ├── bad-aa.yaml │ │ ├── bad-ab.yaml │ │ ├── bad-ba.yaml │ │ ├── bad-k8s.yaml │ │ ├── bad-multi-k8s.yaml │ │ ├── bad-multi.yaml │ │ └── bad-openslo.yaml │ │ └── good │ │ ├── good-aa.yaml │ │ ├── good-ab.yaml │ │ ├── good-ba.yaml │ │ ├── good-k8s.yaml │ │ ├── good-multi-k8s.yaml │ │ ├── good-multi.yaml │ │ └── good-openslo.yaml ├── validate_test.go └── windows │ └── 7d.yaml └── testutils └── cmd.go /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | range: 70..90 # First number represents red, and second represents green. 3 | status: 4 | patch: false 5 | project: 6 | default: 7 | # Allow going down 1% before being a failure. 8 | threshold: 1% 9 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @slok 2 | 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | ignore: 8 | # Ignore Kubernetes dependencies to have full control on them. 
9 | - dependency-name: "k8s.io/*" 10 | - package-ecosystem: "github-actions" 11 | directory: "/" 12 | schedule: 13 | interval: "daily" 14 | - package-ecosystem: "docker" 15 | directory: "/docker/dev" 16 | schedule: 17 | interval: "daily" 18 | - package-ecosystem: "docker" 19 | directory: "/docker/prod" 20 | schedule: 21 | interval: "daily" 22 | -------------------------------------------------------------------------------- /.github/workflows/generate.yaml: -------------------------------------------------------------------------------- 1 | # Sample job that allows you to download the generated files as Artifacts from the GitHub Actions page 2 | 3 | name: SLO generation 4 | 5 | on: 6 | # Allows you to run this workflow manually from the Actions tab 7 | workflow_dispatch: 8 | 9 | jobs: 10 | generate-slo-job-1: 11 | name: Generate the SLOs 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Download and set up the generator binary 16 | run: | 17 | wget https://github.com/slok/sloth/releases/download/v0.9.0/sloth-linux-amd64 18 | chmod +x sloth-linux-amd64 19 | ./sloth-linux-amd64 generate -i ./examples/getting-started.yml -o ./examples/_gen/getting-started.yml 20 | ./sloth-linux-amd64 generate -i ./examples/no-alerts.yml -o ./examples/_gen/no-alerts.yml 21 | - name: 'Upload directory with generated SLOs' 22 | uses: actions/upload-artifact@v3 23 | with: 24 | name: SLOs 25 | path: examples/_gen/ 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/helmrelease.yaml: -------------------------------------------------------------------------------- 1 | name: Release Charts 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "deploy/kubernetes/helm/**" 9 | workflow_dispatch: 10 | 11 | jobs: 12 | release: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Configure Git 21 | run: | 22 | git config user.name "$GITHUB_ACTOR" 23 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 24 | 25 | - name: Install Helm 26 | uses: azure/setup-helm@v3.4 27 | with: 28 | version: v3.7.1 29 | 30 | - name: Run chart-releaser 31 | uses: helm/chart-releaser-action@v1.6.0 32 | with: 33 | charts_dir: deploy/kubernetes/helm 34 | env: 35 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 36 | CR_RELEASE_NAME_TEMPLATE: "sloth-helm-chart-{{ .Version }}" 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Vendor directory 15 | vendor/ 16 | 17 | # Test coverage.
18 | .test_coverage.txt 19 | 20 | # Binaries 21 | /bin 22 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | run: 3 | timeout: 3m 4 | build-tags: 5 | - integration 6 | 7 | linters: 8 | enable: 9 | - misspell 10 | - goimports 11 | - revive 12 | - gofmt 13 | - depguard 14 | - godot 15 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | Sloth is [Apache 2.0 licensed](LICENSE) and accepts contributions via GitHub 4 | pull requests. This document outlines some of the conventions on development 5 | workflow, commit message formatting, contact points and other resources to make 6 | it easier to get your contribution accepted. 7 | 8 | We gratefully welcome improvements to issues and documentation as well as to code. 9 | 10 | ## Getting Started 11 | 12 | - Fork the repository on GitHub 13 | - Read the [README](README.md#getting-started) to get started. 14 | - If you want to contribute as a developer, continue reading this document for further instructions 15 | - Play with the project, submit bugs, submit pull requests! 16 | 17 | ## Contribution workflow 18 | 19 | This is a rough outline of how to prepare a contribution: 20 | 21 | - Fork the repository. 22 | - Create a topic branch from where you want to base your work (usually branched from main). 23 | - Make commits of logical units. 24 | - Make sure your commit messages are clear and self-explanatory. 25 | - Push your changes to a topic branch in your fork of the repository. 26 | - If you changed code, add automated tests to cover your changes. 27 | - Submit a pull request from your fork to the original repository. 28 | 29 | ## Running the application 30 | 31 | ### CLI 32 | 33 | To run the CLI you can use the example specs. Some examples: 34 | 35 | ```bash 36 | go run ./cmd/sloth generate -i ./examples/getting-started.yml 37 | 38 | go run ./cmd/sloth/ validate -i ./examples/ -p ./examples/plugins/ -e _gen 39 | ``` 40 | 41 | ### Kubernetes 42 | 43 | You can run Sloth in controller mode in multiple ways; depending on the part you are working on, one or another may be more helpful. 44 | 45 | > Apart from the options described next, the Kubernetes controller mode has multiple options that can be used to develop, deploy in different ways, or apply maintenance, like selecting a single namespace or using a label selector. Check them with `sloth controller --help` 46 | 47 | #### Without a cluster 48 | 49 | If you are not developing something that needs a real Kubernetes connection, Sloth can run without a Kubernetes cluster by using fake in-memory Kubernetes clients: use `--mode="fake"` 50 | 51 | Example: 52 | 53 | ```bash 54 | go run ./cmd/sloth/ controller --mode=fake --debug 55 | ``` 56 | 57 | #### With a local cluster 58 | 59 | If you need a Kubernetes connection or want to develop against a more realistic setup, you can connect to any Kubernetes cluster with your local credentials by using the `--kube-local` flag. 60 | 61 | ```bash 62 | go run ./cmd/sloth/ controller --kube-local 63 | ``` 64 | 65 | #### Dry run 66 | 67 | If you need to set Sloth in dry-run mode (read-only operations), you can use `--mode=dry-run`. 68 | 69 | ```bash 70 | go run ./cmd/sloth/ controller --mode=dry-run 71 | ``` 72 | 73 | You can use this mode with `--kube-local`.
74 | 75 | ```bash 76 | go run ./cmd/sloth/ controller --kube-local --mode=dry-run 77 | ``` 78 | 79 | ## Automated checks and unit tests 80 | 81 | You can check that your code satisfies the project standards with: 82 | 83 | ```bash 84 | make check 85 | ``` 86 | 87 | You can run the unit tests with: 88 | 89 | ```bash 90 | make test 91 | ``` 92 | 93 | ## Integration tests 94 | 95 | > If any of the required dependencies are missing when running the tests, those tests will be skipped 96 | 97 | ### CLI 98 | 99 | First you will need to build the binary (you can use `make build`). 100 | 101 | Locate the binary, for example `./bin/sloth-linux-amd64`, and set it as the binary the integration tests will execute: 102 | 103 | ```bash 104 | export SLOTH_INTEGRATION_BINARY=${PWD}/bin/sloth-linux-amd64 105 | ``` 106 | 107 | Now you can run the tests: 108 | 109 | ```bash 110 | make ci-integration-cli 111 | ``` 112 | 113 | ### Kubernetes 114 | 115 | For Kubernetes you will need a cluster; the easiest way is to create one using [Kind]. For example, create a cluster and export its access configuration: 116 | 117 | ```bash 118 | kind create cluster --name sloth 119 | kind get kubeconfig --name sloth > /tmp/kind-sloth.kubeconfig 120 | ``` 121 | 122 | Prepare the required CRDs on the cluster: 123 | 124 | ```bash 125 | kubectl --kubeconfig=/tmp/kind-sloth.kubeconfig apply -f ./pkg/kubernetes/gen/crd/ 126 | kubectl --kubeconfig=/tmp/kind-sloth.kubeconfig apply -f ./test/integration/crd 127 | ``` 128 | 129 | Now we are ready. Prepare the integration test settings that point to the Sloth binary you want to use (built with `make build`) and the Kubernetes cluster access config: 130 | 131 | ```bash 132 | export SLOTH_INTEGRATION_BINARY=${PWD}/bin/sloth-linux-amd64 133 | export SLOTH_INTEGRATION_KUBE_CONFIG=/tmp/kind-sloth.kubeconfig 134 | ``` 135 | 136 | Execute the tests: 137 | 138 | ```bash 139 | make ci-integration-k8s 140 | ``` 141 | 142 | ## Profiling 143 | 144 | By default Sloth serves pprof on the metrics port (`8081`). 145 | 146 | Check this [pprof cheatsheet][pprof-cheatsheet].
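A minimal profiling sketch, assuming the controller is running locally and that the standard Go `net/http/pprof` endpoints are what is mounted on the metrics port:

```bash
# With a controller running locally (e.g. `go run ./cmd/sloth/ controller --mode=fake`),
# fetch a heap profile. The /debug/pprof path assumes the standard net/http/pprof
# handlers are exposed on the metrics port (8081).
go tool pprof http://localhost:8081/debug/pprof/heap
```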
147 | 148 | [kind]: https://github.com/kubernetes-sigs/kind 149 | [pprof-cheatsheet]: https://gist.github.com/slok/33dad1d0d0bae07977e6d32bcc010188 150 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL := $(shell which bash) 3 | OSTYPE := $(shell uname) 4 | DOCKER := $(shell command -v docker) 5 | GID := $(shell id -g) 6 | UID := $(shell id -u) 7 | VERSION ?= $(shell git describe --tags --always) 8 | 9 | UNIT_TEST_CMD := ./scripts/check/unit-test.sh 10 | INTEGRATION_TEST_CMD := ./scripts/check/integration-test.sh 11 | INTEGRATION_TEST_K8S_CMD := ./scripts/check/integration-test-k8s.sh 12 | INTEGRATION_TEST_CLI_CMD := ./scripts/check/integration-test-cli.sh 13 | HELM_TEST_CMD := ./scripts/check/helm-test.sh 14 | CHECK_CMD := ./scripts/check/check.sh 15 | 16 | DEV_IMAGE_NAME := local/sloth-dev 17 | PROD_IMAGE_NAME ?= ghcr.io/slok/sloth 18 | 19 | DOCKER_RUN_CMD := docker run --env ostype=$(OSTYPE) -v ${PWD}:/src --rm ${DEV_IMAGE_NAME} 20 | BUILD_BINARY_CMD := VERSION=${VERSION} ./scripts/build/bin/build.sh 21 | BUILD_BINARY_ALL_CMD := VERSION=${VERSION} ./scripts/build/bin/build-all.sh 22 | BUILD_DEV_IMAGE_CMD := IMAGE=${DEV_IMAGE_NAME} DOCKER_FILE_PATH=./docker/dev/Dockerfile VERSION=latest ./scripts/build/docker/build-image-dev.sh 23 | BUILD_PROD_IMAGE_CMD := IMAGE=${PROD_IMAGE_NAME} DOCKER_FILE_PATH=./docker/prod/Dockerfile VERSION=${VERSION} ./scripts/build/docker/build-image.sh 24 | BUILD_PUBLISH_PROD_IMAGE_ALL_CMD := IMAGE=${PROD_IMAGE_NAME} DOCKER_FILE_PATH=./docker/prod/Dockerfile VERSION=${VERSION} ./scripts/build/docker/build-publish-image-all.sh 25 | PUBLISH_PROD_IMAGE_CMD := IMAGE=${PROD_IMAGE_NAME} VERSION=${VERSION} ./scripts/build/docker/publish-image.sh 26 | 27 | 28 | help: ## Show this help 29 | @echo "Help" 30 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-20s\033[93m %s\n", $$1, $$2}' 31 | 32 | .PHONY: default 33 | default: help 34 | 35 | .PHONY: build-image 36 | build-image: ## Builds the production docker image. 37 | @$(BUILD_PROD_IMAGE_CMD) 38 | 39 | build-publish-image-all: ## Builds and publishes all the production docker images (multiarch). 40 | @$(BUILD_PUBLISH_PROD_IMAGE_ALL_CMD) 41 | 42 | .PHONY: build-dev-image 43 | build-dev-image: ## Builds the development docker image. 44 | @$(BUILD_DEV_IMAGE_CMD) 45 | 46 | build: build-dev-image ## Builds the production binary. 47 | @$(DOCKER_RUN_CMD) /bin/sh -c '$(BUILD_BINARY_CMD)' 48 | 49 | build-all: build-dev-image ## Builds the production binaries for all archs. 50 | @$(DOCKER_RUN_CMD) /bin/sh -c '$(BUILD_BINARY_ALL_CMD)' 51 | 52 | .PHONY: test 53 | test: build-dev-image ## Runs unit test. 54 | @$(DOCKER_RUN_CMD) /bin/sh -c '$(UNIT_TEST_CMD)' 55 | 56 | .PHONY: helm-test 57 | helm-test: build-dev-image ## Runs helm chart test. 58 | @$(DOCKER_RUN_CMD) /bin/sh -c '$(HELM_TEST_CMD)' 59 | 60 | .PHONY: check 61 | check: build-dev-image ## Runs checks. 62 | @$(DOCKER_RUN_CMD) /bin/sh -c '$(CHECK_CMD)' 63 | 64 | .PHONY: integration 65 | integration: build-dev-image ## Runs integration test. 66 | @$(DOCKER_RUN_CMD) /bin/sh -c '$(INTEGRATION_TEST_CMD)' 67 | 68 | .PHONY: go-gen 69 | go-gen: build-dev-image ## Generates go based code. 70 | @$(DOCKER_RUN_CMD) /bin/sh -c './scripts/gogen.sh' 71 | 72 | .PHONY: kube-gen 73 | kube-gen: build-dev-image ## Generates Kubernetes based code.
74 | @$(DOCKER_RUN_CMD) /bin/sh -c './scripts/kubegen.sh' 75 | 76 | .PHONY: examples-gen 77 | examples-gen: build-dev-image ## Generates sloth examples. 78 | @$(DOCKER_RUN_CMD) /bin/sh -c './scripts/examplesgen.sh' 79 | 80 | .PHONY: deploy-gen 81 | deploy-gen: build-dev-image ## Generates sloth deploy. 82 | @$(DOCKER_RUN_CMD) /bin/sh -c './scripts/deploygen.sh' 83 | 84 | .PHONY: gen 85 | gen: kube-gen go-gen examples-gen deploy-gen ## Generates all. 86 | 87 | .PHONY: deps 88 | deps: ## Fixes the dependencies 89 | @$(DOCKER_RUN_CMD) /bin/sh -c './scripts/deps.sh' 90 | 91 | .PHONY: ci-build 92 | ci-build: ## Builds the production binary in CI environment (without docker). 93 | @$(BUILD_BINARY_CMD) 94 | 95 | .PHONY: ci-test 96 | ci-test: ## Runs unit test in CI environment (without docker). 97 | @$(UNIT_TEST_CMD) 98 | 99 | .PHONY: ci-helm-test 100 | ci-helm-test: ## Runs helm chart tests in CI environment (without docker). 101 | @$(HELM_TEST_CMD) 102 | 103 | .PHONY: ci-check 104 | ci-check: ## Runs checks in CI environment (without docker). 105 | @$(CHECK_CMD) 106 | 107 | .PHONY: ci-integration 108 | ci-integration: ## Runs integration test in CI environment (without docker). 109 | @$(INTEGRATION_TEST_CMD) 110 | 111 | .PHONY: ci-integration-cli 112 | ci-integration-cli: ## Runs integration test for CLI in CI environment (without docker). 113 | @$(INTEGRATION_TEST_CLI_CMD) 114 | 115 | .PHONY: ci-integration-k8s 116 | ci-integration-k8s: ## Runs integration test for K8s in CI environment (without docker). 117 | @$(INTEGRATION_TEST_K8S_CMD) 118 | -------------------------------------------------------------------------------- /cmd/sloth/commands/commands.go: -------------------------------------------------------------------------------- 1 | package commands 2 | 3 | import ( 4 | "context" 5 | "io" 6 | 7 | "gopkg.in/alecthomas/kingpin.v2" 8 | 9 | "github.com/slok/sloth/internal/log" 10 | ) 11 | 12 | const ( 13 | // LoggerTypeDefault is the logger default type. 14 | LoggerTypeDefault = "default" 15 | // LoggerTypeJSON is the logger json type. 16 | LoggerTypeJSON = "json" 17 | ) 18 | 19 | // Command represents an application command; all commands that want to be executed 20 | // should implement it and be set up in main. 21 | type Command interface { 22 | Name() string 23 | Run(ctx context.Context, config RootConfig) error 24 | } 25 | 26 | // RootConfig represents the root command configuration and global configuration 27 | // for all the commands. 28 | type RootConfig struct { 29 | // Global flags. 30 | Debug bool 31 | NoLog bool 32 | NoColor bool 33 | LoggerType string 34 | 35 | // Global instances. 36 | Stdin io.Reader 37 | Stdout io.Writer 38 | Stderr io.Writer 39 | Logger log.Logger 40 | } 41 | 42 | // NewRootConfig initializes the main root configuration. 43 | func NewRootConfig(app *kingpin.Application) *RootConfig { 44 | c := &RootConfig{} 45 | 46 | // Register flags.
47 | app.Flag("debug", "Enable debug mode.").BoolVar(&c.Debug) 48 | app.Flag("no-log", "Disable logger.").BoolVar(&c.NoLog) 49 | app.Flag("no-color", "Disable logger color.").BoolVar(&c.NoColor) 50 | app.Flag("logger", "Selects the logger type.").Default(LoggerTypeDefault).EnumVar(&c.LoggerType, LoggerTypeDefault, LoggerTypeJSON) 51 | 52 | return c 53 | } 54 | -------------------------------------------------------------------------------- /cmd/sloth/commands/helpers.go: -------------------------------------------------------------------------------- 1 | package commands 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "io/fs" 8 | "path/filepath" 9 | "regexp" 10 | "strings" 11 | 12 | "github.com/slok/sloth/internal/log" 13 | "github.com/slok/sloth/internal/prometheus" 14 | ) 15 | 16 | var ( 17 | splitMarkRe = regexp.MustCompile("(?m)^---") 18 | rmCommentsRe = regexp.MustCompile("(?m)^#.*$") 19 | ) 20 | 21 | func splitYAML(data []byte) []string { 22 | // Sanitize. 23 | data = bytes.TrimSpace(data) 24 | data = rmCommentsRe.ReplaceAll(data, []byte("")) 25 | 26 | // Split (YAML can declare multiple documents in the same file using `---`). 27 | dataSplit := splitMarkRe.Split(string(data), -1) 28 | 29 | // Remove empty splits. 30 | nonEmptyData := []string{} 31 | for _, d := range dataSplit { 32 | d = strings.TrimSpace(d) 33 | if d != "" { 34 | nonEmptyData = append(nonEmptyData, d) 35 | } 36 | } 37 | 38 | return nonEmptyData 39 | } 40 | 41 | func createPluginLoader(ctx context.Context, logger log.Logger, paths []string) (*prometheus.FileSLIPluginRepo, error) { 42 | config := prometheus.FileSLIPluginRepoConfig{ 43 | Paths: paths, 44 | Logger: logger, 45 | } 46 | sliPluginRepo, err := prometheus.NewFileSLIPluginRepo(config) 47 | if err != nil { 48 | return nil, fmt.Errorf("could not create file SLI plugin repository: %w", err) 49 | } 50 | 51 | return sliPluginRepo, nil 52 | } 53 | 54 | func discoverSLOManifests(logger log.Logger, exclude, include *regexp.Regexp, path string) ([]string, error) { 55 | logger = logger.WithValues(log.Kv{"svc": "SLODiscovery"}) 56 | 57 | paths := []string{} 58 | err := filepath.Walk(path, func(path string, info fs.FileInfo, err error) error { 59 | if err != nil { 60 | return err 61 | } 62 | 63 | if info.IsDir() { 64 | return nil 65 | } 66 | 67 | // Non-YAML files don't need to be handled (directories already returned above). 68 | extension := strings.ToLower(filepath.Ext(path)) 69 | if extension != ".yml" && extension != ".yaml" { 70 | return nil 71 | } 72 | 73 | // Filter by exclude or include (exclude has preference). 74 | if exclude != nil && exclude.MatchString(path) { 75 | logger.Debugf("Excluding path due to exclude filter %s", path) 76 | return nil 77 | } 78 | if include != nil && !include.MatchString(path) { 79 | logger.Debugf("Excluding path due to include filter %s", path) 80 | return nil 81 | } 82 | 83 | // If we reach here, the path has been discovered.
84 | paths = append(paths, path) 85 | 86 | return nil 87 | }) 88 | 89 | if err != nil { 90 | return nil, fmt.Errorf("could not find files recursively: %w", err) 91 | } 92 | 93 | return paths, nil 94 | } 95 | -------------------------------------------------------------------------------- /cmd/sloth/commands/version.go: -------------------------------------------------------------------------------- 1 | package commands 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "gopkg.in/alecthomas/kingpin.v2" 8 | 9 | "github.com/slok/sloth/internal/info" 10 | ) 11 | 12 | type versionCommand struct{} 13 | 14 | // NewVersionCommand returns the version command. 15 | func NewVersionCommand(app *kingpin.Application) Command { 16 | c := &versionCommand{} 17 | app.Command("version", "Shows version.") 18 | 19 | return c 20 | } 21 | 22 | func (versionCommand) Name() string { return "version" } 23 | func (versionCommand) Run(ctx context.Context, config RootConfig) error { 24 | fmt.Fprint(config.Stdout, info.Version) 25 | return nil 26 | } 27 | -------------------------------------------------------------------------------- /cmd/sloth/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "os" 8 | 9 | "github.com/sirupsen/logrus" 10 | "gopkg.in/alecthomas/kingpin.v2" 11 | 12 | "github.com/slok/sloth/cmd/sloth/commands" 13 | "github.com/slok/sloth/internal/info" 14 | "github.com/slok/sloth/internal/log" 15 | loglogrus "github.com/slok/sloth/internal/log/logrus" 16 | ) 17 | 18 | // Run runs the main application. 19 | func Run(ctx context.Context, args []string, stdin io.Reader, stdout, stderr io.Writer) error { 20 | app := kingpin.New("sloth", "Easy SLO generator.") 21 | app.DefaultEnvars() 22 | config := commands.NewRootConfig(app) 23 | 24 | // Setup commands (registers flags). 25 | generateCmd := commands.NewGenerateCommand(app) 26 | kubeCtrlCmd := commands.NewKubeControllerCommand(app) 27 | validateCmd := commands.NewValidateCommand(app) 28 | versionCmd := commands.NewVersionCommand(app) 29 | 30 | cmds := map[string]commands.Command{ 31 | generateCmd.Name(): generateCmd, 32 | kubeCtrlCmd.Name(): kubeCtrlCmd, 33 | validateCmd.Name(): validateCmd, 34 | versionCmd.Name(): versionCmd, 35 | } 36 | 37 | // Parse commandline. 38 | cmdName, err := app.Parse(args[1:]) 39 | if err != nil { 40 | return fmt.Errorf("invalid command configuration: %w", err) 41 | } 42 | 43 | // Set up global dependencies. 44 | config.Stdin = stdin 45 | config.Stdout = stdout 46 | config.Stderr = stderr 47 | config.Logger = getLogger(*config) 48 | 49 | // Execute command. 50 | err = cmds[cmdName].Run(ctx, *config) 51 | if err != nil { 52 | return fmt.Errorf("%q command failed: %w", cmdName, err) 53 | } 54 | 55 | return nil 56 | } 57 | 58 | // getLogger returns the application logger. 59 | func getLogger(config commands.RootConfig) log.Logger { 60 | if config.NoLog { 61 | return log.Noop 62 | } 63 | 64 | // If the logger is not disabled, use a logrus logger. 65 | logrusLog := logrus.New() 66 | logrusLog.Out = config.Stderr // By default the logger goes to stderr (so it stays separate from stdout prints). 67 | logrusLogEntry := logrus.NewEntry(logrusLog) 68 | 69 | if config.Debug { 70 | logrusLogEntry.Logger.SetLevel(logrus.DebugLevel) 71 | } 72 | 73 | // Log format.
74 | switch config.LoggerType { 75 | case commands.LoggerTypeDefault: 76 | logrusLogEntry.Logger.SetFormatter(&logrus.TextFormatter{ 77 | ForceColors: !config.NoColor, 78 | DisableColors: config.NoColor, 79 | }) 80 | case commands.LoggerTypeJSON: 81 | logrusLogEntry.Logger.SetFormatter(&logrus.JSONFormatter{}) 82 | } 83 | 84 | logger := loglogrus.NewLogrus(logrusLogEntry).WithValues(log.Kv{ 85 | "version": info.Version, 86 | }) 87 | 88 | logger.Debugf("Debug level is enabled") // Will log only when debug enabled. 89 | 90 | return logger 91 | } 92 | 93 | func main() { 94 | ctx := context.Background() 95 | err := Run(ctx, os.Args, os.Stdin, os.Stdout, os.Stderr) 96 | if err != nil { 97 | fmt.Fprintf(os.Stderr, "error: %s", err) 98 | os.Exit(1) 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | 25 | # Custom. 26 | tests/ 27 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: sloth 3 | description: Base chart for Sloth. 4 | type: application 5 | home: https://github.com/linode-obs/sloth 6 | kubeVersion: ">= 1.19.0-0" 7 | version: 0.8.2 8 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{- define "sloth.name" -}} 2 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 3 | {{- end }} 4 | 5 | {{- define "sloth.fullname" -}} 6 | {{- if .Values.fullnameOverride }} 7 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 8 | {{- else }} 9 | {{- $name := default .Chart.Name .Values.nameOverride }} 10 | {{- if contains $name .Release.Name }} 11 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 12 | {{- else }} 13 | {{- printf "%s-%s" $name .Release.Name | trunc 63 | trimSuffix "-" }} 14 | {{- end }} 15 | {{- end }} 16 | {{- end }} 17 | 18 | 19 | {{- define "sloth.labels" -}} 20 | helm.sh/chart: {{ include "sloth.chart" . }} 21 | {{- if .Chart.AppVersion }} 22 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 23 | {{- end }} 24 | app.kubernetes.io/managed-by: {{ .Release.Service }} 25 | {{ include "sloth.selectorLabels" . }} 26 | {{- with .Values.labels }} 27 | {{ toYaml . }} 28 | {{- end }} 29 | {{- end }} 30 | 31 | 32 | 33 | {{- define "sloth.selectorLabels" -}} 34 | app: {{ include "sloth.name" . }} 35 | app.kubernetes.io/name: {{ include "sloth.name" . }} 36 | app.kubernetes.io/instance: {{ .Release.Name }} 37 | {{- end }} 38 | 39 | {{- define "sloth.chart" -}} 40 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 41 | {{- end }} 42 | {{- define "sloth.imagePullSecrets" -}} 43 | {{- range .Values.imagePullSecrets }} 44 | - {{ toYaml . 
| trim }} 45 | {{- end }} 46 | {{- end }} 47 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/templates/cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "sloth.fullname" . }} 6 | labels: 7 | {{- include "sloth.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: {{ include "sloth.fullname" . }} 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ include "sloth.fullname" . }} 15 | namespace: {{ .Release.Namespace }} 16 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/templates/cluster-role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "sloth.fullname" . }} 6 | labels: 7 | {{- include "sloth.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: ["sloth.slok.dev"] 10 | resources: ["*"] 11 | verbs: ["*"] 12 | 13 | - apiGroups: ["monitoring.coreos.com"] 14 | resources: ["prometheusrules"] 15 | verbs: ["create", "list", "get", "update", "watch"] 16 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.customSloConfig.enabled }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ include "sloth.fullname" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "sloth.labels" . | nindent 4 }} 9 | data: 10 | window.yaml: | 11 | {{- toYaml .Values.customSloConfig.data | nindent 4 }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/templates/pod-monitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.metrics.enabled }} 2 | --- 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PodMonitor 5 | metadata: 6 | name: {{ include "sloth.fullname" . }} 7 | namespace: {{ .Release.Namespace }} 8 | labels: 9 | {{- include "sloth.labels" . | nindent 4 }} 10 | {{- with .Values.metrics.prometheusLabels }} 11 | {{- toYaml . | nindent 4 }} 12 | {{- end }} 13 | spec: 14 | selector: 15 | matchLabels: 16 | {{- include "sloth.selectorLabels" . | nindent 6 }} 17 | podMetricsEndpoints: 18 | - port: metrics 19 | {{- with .Values.metrics.scrapeInterval }} 20 | interval: {{.}} 21 | {{- end }} 22 | {{- end }} -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/templates/service-account.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "sloth.fullname" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "sloth.labels" . 
| nindent 4 }} 9 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/cluster_role_binding_custom.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/cluster-role-binding.yaml 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | kind: ClusterRoleBinding 5 | metadata: 6 | name: sloth-test 7 | labels: 8 | helm.sh/chart: sloth- 9 | app.kubernetes.io/managed-by: Helm 10 | app: sloth 11 | app.kubernetes.io/name: sloth 12 | app.kubernetes.io/instance: test 13 | label-from: test 14 | roleRef: 15 | apiGroup: rbac.authorization.k8s.io 16 | kind: ClusterRole 17 | name: sloth-test 18 | subjects: 19 | - kind: ServiceAccount 20 | name: sloth-test 21 | namespace: custom 22 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/cluster_role_binding_default.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/cluster-role-binding.yaml 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | kind: ClusterRoleBinding 5 | metadata: 6 | name: sloth 7 | labels: 8 | helm.sh/chart: sloth- 9 | app.kubernetes.io/managed-by: Helm 10 | app: sloth 11 | app.kubernetes.io/name: sloth 12 | app.kubernetes.io/instance: sloth 13 | roleRef: 14 | apiGroup: rbac.authorization.k8s.io 15 | kind: ClusterRole 16 | name: sloth 17 | subjects: 18 | - kind: ServiceAccount 19 | name: sloth 20 | namespace: default 21 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/cluster_role_custom.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/cluster-role.yaml 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | kind: ClusterRole 5 | metadata: 6 | name: sloth-test 7 | labels: 8 | helm.sh/chart: sloth- 9 | app.kubernetes.io/managed-by: Helm 10 | app: sloth 11 | app.kubernetes.io/name: sloth 12 | app.kubernetes.io/instance: test 13 | label-from: test 14 | rules: 15 | - apiGroups: ["sloth.slok.dev"] 16 | resources: ["*"] 17 | verbs: ["*"] 18 | 19 | - apiGroups: ["monitoring.coreos.com"] 20 | resources: ["prometheusrules"] 21 | verbs: ["create", "list", "get", "update", "watch"] 22 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/cluster_role_default.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/cluster-role.yaml 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | kind: ClusterRole 5 | metadata: 6 | name: sloth 7 | labels: 8 | helm.sh/chart: sloth- 9 | app.kubernetes.io/managed-by: Helm 10 | app: sloth 11 | app.kubernetes.io/name: sloth 12 | app.kubernetes.io/instance: sloth 13 | rules: 14 | - apiGroups: ["sloth.slok.dev"] 15 | resources: ["*"] 16 | verbs: ["*"] 17 | 18 | - apiGroups: ["monitoring.coreos.com"] 19 | resources: ["prometheusrules"] 20 | verbs: ["create", "list", "get", "update", "watch"] 21 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/configmap_slo_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/configmap.yaml 3 | apiVersion: v1 4 | kind: ConfigMap 5 | metadata: 
6 | name: sloth-test 7 | namespace: custom 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: test 14 | label-from: test 15 | data: 16 | window.yaml: | 17 | customKey: customValue 18 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/deployment_custom.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/deployment.yaml 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: sloth-test 7 | namespace: custom 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: test 14 | label-from: test 15 | spec: 16 | replicas: 1 17 | selector: 18 | matchLabels: 19 | app: sloth 20 | app.kubernetes.io/name: sloth 21 | app.kubernetes.io/instance: test 22 | template: 23 | metadata: 24 | labels: 25 | helm.sh/chart: sloth- 26 | app.kubernetes.io/managed-by: Helm 27 | app: sloth 28 | app.kubernetes.io/name: sloth 29 | app.kubernetes.io/instance: test 30 | label-from: test 31 | annotations: 32 | kubectl.kubernetes.io/default-container: sloth 33 | spec: 34 | serviceAccountName: sloth-test 35 | securityContext: 36 | fsGroup: 100 37 | runAsGroup: 1000 38 | runAsNonRoot: true 39 | runAsUser: 100 40 | containers: 41 | - name: sloth 42 | image: linode-obs/sloth-test:v1.42.42 43 | args: 44 | - kubernetes-controller 45 | - --resync-interval=17m 46 | - --workers=99 47 | - --namespace=somens 48 | - --label-selector=x=y,z!=y 49 | - --extra-labels=k1=v1 50 | - --extra-labels=k2=v2 51 | - --sli-plugins-path=/plugins 52 | - --disable-optimized-rules 53 | - --logger=default 54 | ports: 55 | - containerPort: 8081 56 | name: metrics 57 | protocol: TCP 58 | volumeMounts: 59 | - name: sloth-common-sli-plugins 60 | mountPath: /plugins/sloth-common-sli-plugins 61 | securityContext: 62 | allowPrivilegeEscalation: false 63 | resources: 64 | limits: 65 | cpu: 50m 66 | memory: 150Mi 67 | requests: 68 | cpu: 5m 69 | memory: 75Mi 70 | - name: git-sync-plugins 71 | image: k8s.gcr.io/git-sync/git-sync:v3.6.1 72 | args: 73 | - --repo=https://github.com/slok/sloth-test-common-sli-plugins 74 | - --branch=main 75 | - --wait=30 76 | - --webhook-url=http://localhost:8082/-/reload 77 | volumeMounts: 78 | - name: sloth-common-sli-plugins 79 | # Default path for git-sync. 
80 | mountPath: /tmp/git 81 | securityContext: 82 | allowPrivilegeEscalation: false 83 | resources: 84 | limits: 85 | cpu: 50m 86 | memory: 100Mi 87 | requests: 88 | cpu: 5m 89 | memory: 50Mi 90 | volumes: 91 | - name: sloth-common-sli-plugins 92 | emptyDir: {} 93 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/deployment_custom_no_extras.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/deployment.yaml 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: sloth-test 7 | namespace: custom 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: test 14 | label-from: test 15 | spec: 16 | replicas: 1 17 | selector: 18 | matchLabels: 19 | app: sloth 20 | app.kubernetes.io/name: sloth 21 | app.kubernetes.io/instance: test 22 | template: 23 | metadata: 24 | labels: 25 | helm.sh/chart: sloth- 26 | app.kubernetes.io/managed-by: Helm 27 | app: sloth 28 | app.kubernetes.io/name: sloth 29 | app.kubernetes.io/instance: test 30 | label-from: test 31 | annotations: 32 | kubectl.kubernetes.io/default-container: sloth 33 | spec: 34 | serviceAccountName: sloth-test 35 | securityContext: 36 | fsGroup: 100 37 | runAsGroup: 1000 38 | runAsNonRoot: true 39 | runAsUser: 100 40 | containers: 41 | - name: sloth 42 | image: linode-obs/sloth-test:v1.42.42 43 | args: 44 | - kubernetes-controller 45 | - --resync-interval=17m 46 | - --workers=99 47 | - --namespace=somens 48 | - --label-selector=x=y,z!=y 49 | - --extra-labels=k1=v1 50 | - --extra-labels=k2=v2 51 | - --disable-optimized-rules 52 | - --logger=default 53 | securityContext: 54 | allowPrivilegeEscalation: false 55 | resources: 56 | limits: 57 | cpu: 50m 58 | memory: 150Mi 59 | requests: 60 | cpu: 5m 61 | memory: 75Mi 62 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/deployment_custom_slo_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/deployment.yaml 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: sloth-test 7 | namespace: custom 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: test 14 | label-from: test 15 | spec: 16 | replicas: 1 17 | selector: 18 | matchLabels: 19 | app: sloth 20 | app.kubernetes.io/name: sloth 21 | app.kubernetes.io/instance: test 22 | template: 23 | metadata: 24 | labels: 25 | helm.sh/chart: sloth- 26 | app.kubernetes.io/managed-by: Helm 27 | app: sloth 28 | app.kubernetes.io/name: sloth 29 | app.kubernetes.io/instance: test 30 | label-from: test 31 | annotations: 32 | kubectl.kubernetes.io/default-container: sloth 33 | checksum/config: 34 | spec: 35 | serviceAccountName: sloth-test 36 | securityContext: 37 | fsGroup: 100 38 | runAsGroup: 1000 39 | runAsNonRoot: true 40 | runAsUser: 100 41 | containers: 42 | - name: sloth 43 | image: linode-obs/sloth-test:v1.42.42 44 | args: 45 | - kubernetes-controller 46 | - --resync-interval=17m 47 | - --workers=99 48 | - --namespace=somens 49 | - --label-selector=x=y,z!=y 50 | - --extra-labels=k1=v1 51 | - --extra-labels=k2=v2 52 | - --disable-optimized-rules 53 | - --slo-period-windows-path=/windows 54 | - 
--logger=default 55 | ports: 56 | - containerPort: 8081 57 | name: metrics 58 | protocol: TCP 59 | volumeMounts: 60 | - name: sloth-windows 61 | mountPath: /windows 62 | securityContext: 63 | allowPrivilegeEscalation: false 64 | resources: 65 | limits: 66 | cpu: 50m 67 | memory: 150Mi 68 | requests: 69 | cpu: 5m 70 | memory: 75Mi 71 | volumes: 72 | - name: sloth-windows 73 | configMap: 74 | defaultMode: 420 75 | name: sloth-test 76 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/deployment_default.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/deployment.yaml 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: sloth 7 | namespace: default 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: sloth 14 | spec: 15 | replicas: 1 16 | selector: 17 | matchLabels: 18 | app: sloth 19 | app.kubernetes.io/name: sloth 20 | app.kubernetes.io/instance: sloth 21 | template: 22 | metadata: 23 | labels: 24 | helm.sh/chart: sloth- 25 | app.kubernetes.io/managed-by: Helm 26 | app: sloth 27 | app.kubernetes.io/name: sloth 28 | app.kubernetes.io/instance: sloth 29 | annotations: 30 | kubectl.kubernetes.io/default-container: sloth 31 | spec: 32 | serviceAccountName: sloth 33 | containers: 34 | - name: sloth 35 | image: ghcr.io/linode-obs/sloth:v0.13.1 36 | args: 37 | - kubernetes-controller 38 | - --sli-plugins-path=/plugins 39 | - --logger=default 40 | ports: 41 | - containerPort: 8081 42 | name: metrics 43 | protocol: TCP 44 | volumeMounts: 45 | - name: sloth-common-sli-plugins 46 | mountPath: /plugins/sloth-common-sli-plugins 47 | resources: 48 | limits: 49 | cpu: 50m 50 | memory: 150Mi 51 | requests: 52 | cpu: 5m 53 | memory: 75Mi 54 | - name: git-sync-plugins 55 | image: k8s.gcr.io/git-sync/git-sync:v3.6.1 56 | args: 57 | - --repo=https://github.com/slok/sloth-common-sli-plugins 58 | - --branch=main 59 | - --wait=30 60 | - --webhook-url=http://localhost:8082/-/reload 61 | volumeMounts: 62 | - name: sloth-common-sli-plugins 63 | # Default path for git-sync. 
64 | mountPath: /tmp/git 65 | resources: 66 | limits: 67 | cpu: 50m 68 | memory: 100Mi 69 | requests: 70 | cpu: 5m 71 | memory: 50Mi 72 | volumes: 73 | - name: sloth-common-sli-plugins 74 | emptyDir: {} 75 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/pod_monitor_custom.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/pod-monitor.yaml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PodMonitor 5 | metadata: 6 | name: sloth-test 7 | namespace: custom 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: test 14 | label-from: test 15 | kp1: vp1 16 | kp2: vp2 17 | spec: 18 | selector: 19 | matchLabels: 20 | app: sloth 21 | app.kubernetes.io/name: sloth 22 | app.kubernetes.io/instance: test 23 | podMetricsEndpoints: 24 | - port: metrics 25 | interval: 45s 26 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/pod_monitor_default.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/pod-monitor.yaml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PodMonitor 5 | metadata: 6 | name: sloth 7 | namespace: default 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: sloth 14 | spec: 15 | selector: 16 | matchLabels: 17 | app: sloth 18 | app.kubernetes.io/name: sloth 19 | app.kubernetes.io/instance: sloth 20 | podMetricsEndpoints: 21 | - port: metrics 22 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/sa_custom.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/service-account.yaml 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: sloth-test 7 | namespace: custom 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: test 14 | label-from: test 15 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/testdata/output/sa_default.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/service-account.yaml 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: sloth 7 | namespace: default 8 | labels: 9 | helm.sh/chart: sloth- 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: sloth 14 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/tests/values_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | type msi = map[string]interface{} 4 | 5 | func defaultValues() msi { 6 | return msi{} 7 | } 8 | 9 | func customValues() msi { 10 | return msi{ 11 | "labels": msi{ 12 | "label-from": "test", 13 | }, 14 | 15 | "image": msi{ 16 | "repository": "linode-obs/sloth-test", 17 | "tag": "v1.42.42", 18 | }, 19 | 20 | "sloth": msi{ 21 | "resyncInterval": "17m", 22 | 
"workers": 99, 23 | "labelSelector": `x=y,z!=y`, 24 | "namespace": "somens", 25 | "optimizedRules": false, 26 | "extraLabels": msi{ 27 | "k1": "v1", 28 | "k2": "v2", 29 | }, 30 | }, 31 | 32 | "commonPlugins": msi{ 33 | "enabled": true, 34 | "gitRepo": msi{ 35 | "url": "https://github.com/slok/sloth-test-common-sli-plugins", 36 | "branch": "main", 37 | }, 38 | }, 39 | 40 | "metrics": msi{ 41 | "enabled": true, 42 | "scrapeInterval": "45s", 43 | "prometheusLabels": msi{ 44 | "kp1": "vp1", 45 | "kp2": "vp2", 46 | }, 47 | }, 48 | 49 | "customSloConfig": msi{ 50 | "data": msi{ 51 | "customKey": "customValue", 52 | }, 53 | }, 54 | 55 | "securityContext": msi{ 56 | "pod": msi{ 57 | "runAsNonRoot": true, 58 | "runAsGroup": 1000, 59 | "runAsUser": 100, 60 | "fsGroup": 100, 61 | }, 62 | "container": msi{ 63 | "allowPrivilegeEscalation": false, 64 | }, 65 | }, 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /deploy/kubernetes/helm/sloth/values.yaml: -------------------------------------------------------------------------------- 1 | labels: {} 2 | 3 | image: 4 | repository: ghcr.io/linode-obs/sloth 5 | tag: v0.13.1 6 | 7 | # -- Container resources: requests and limits for CPU, Memory 8 | resources: 9 | limits: 10 | cpu: 50m 11 | memory: 150Mi 12 | requests: 13 | cpu: 5m 14 | memory: 75Mi 15 | 16 | imagePullSecrets: [] 17 | # - name: secret1 18 | # - name: secret2 19 | 20 | sloth: 21 | resyncInterval: "" # The controller resync interval duration (e.g 15m). 22 | workers: 0 # The number of concurrent controller workers (e.g 5). 23 | labelSelector: "" # Sloth will handle only the ones that match the selector. 24 | namespace: "" # The namespace where sloth will the CRs to process. 25 | extraLabels: {} # Labels that will be added to all the generated SLO Rules. 26 | defaultSloPeriod: "" # The slo period used by sloth (e.g. 30d). 27 | optimizedRules: true # Reduce prom load for calculating period window burnrates. 28 | debug: 29 | enabled: false 30 | # Could be: default or json 31 | logger: default 32 | 33 | commonPlugins: 34 | enabled: true 35 | image: 36 | repository: k8s.gcr.io/git-sync/git-sync 37 | tag: v3.6.1 38 | gitRepo: 39 | url: https://github.com/slok/sloth-common-sli-plugins 40 | branch: main 41 | resources: 42 | limits: 43 | cpu: 50m 44 | memory: 100Mi 45 | requests: 46 | cpu: 5m 47 | memory: 50Mi 48 | 49 | metrics: 50 | enabled: true 51 | #scrapeInterval: 30s 52 | prometheusLabels: {} 53 | 54 | customSloConfig: 55 | enabled: false 56 | path: /windows 57 | data: {} 58 | # apiVersion: sloth.slok.dev/v1 59 | # kind: AlertWindows 60 | # spec: 61 | # ... 
See https://sloth.dev/usage/slo-period-windows/ 62 | 63 | # add deployment pod tolerations 64 | # tolerations: 65 | # - key: kubernetes.azure.com/scalesetpriority 66 | # operator: Equal 67 | # value: spot 68 | # effect: NoSchedule 69 | 70 | securityContext: 71 | pod: null 72 | # fsGroup: 100 73 | # runAsGroup: 1000 74 | # runAsNonRoot: true 75 | # runAsUser: 100 76 | container: null 77 | # allowPrivilegeEscalation: false 78 | -------------------------------------------------------------------------------- /deploy/kubernetes/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - raw/sloth-with-common-plugins.yaml -------------------------------------------------------------------------------- /deploy/kubernetes/raw/sloth-with-common-plugins.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/service-account.yaml 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: sloth 7 | namespace: monitoring 8 | labels: 9 | helm.sh/chart: sloth-0.6.4 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: sloth 14 | --- 15 | # Source: sloth/templates/cluster-role.yaml 16 | apiVersion: rbac.authorization.k8s.io/v1 17 | kind: ClusterRole 18 | metadata: 19 | name: sloth 20 | labels: 21 | helm.sh/chart: sloth-0.6.4 22 | app.kubernetes.io/managed-by: Helm 23 | app: sloth 24 | app.kubernetes.io/name: sloth 25 | app.kubernetes.io/instance: sloth 26 | rules: 27 | - apiGroups: ["sloth.slok.dev"] 28 | resources: ["*"] 29 | verbs: ["*"] 30 | 31 | - apiGroups: ["monitoring.coreos.com"] 32 | resources: ["prometheusrules"] 33 | verbs: ["create", "list", "get", "update", "watch"] 34 | --- 35 | # Source: sloth/templates/cluster-role-binding.yaml 36 | apiVersion: rbac.authorization.k8s.io/v1 37 | kind: ClusterRoleBinding 38 | metadata: 39 | name: sloth 40 | labels: 41 | helm.sh/chart: sloth-0.6.4 42 | app.kubernetes.io/managed-by: Helm 43 | app: sloth 44 | app.kubernetes.io/name: sloth 45 | app.kubernetes.io/instance: sloth 46 | roleRef: 47 | apiGroup: rbac.authorization.k8s.io 48 | kind: ClusterRole 49 | name: sloth 50 | subjects: 51 | - kind: ServiceAccount 52 | name: sloth 53 | namespace: monitoring 54 | --- 55 | # Source: sloth/templates/deployment.yaml 56 | apiVersion: apps/v1 57 | kind: Deployment 58 | metadata: 59 | name: sloth 60 | namespace: monitoring 61 | labels: 62 | helm.sh/chart: sloth-0.6.4 63 | app.kubernetes.io/managed-by: Helm 64 | app: sloth 65 | app.kubernetes.io/name: sloth 66 | app.kubernetes.io/instance: sloth 67 | spec: 68 | replicas: 1 69 | selector: 70 | matchLabels: 71 | app: sloth 72 | app.kubernetes.io/name: sloth 73 | app.kubernetes.io/instance: sloth 74 | template: 75 | metadata: 76 | labels: 77 | helm.sh/chart: sloth-0.6.4 78 | app.kubernetes.io/managed-by: Helm 79 | app: sloth 80 | app.kubernetes.io/name: sloth 81 | app.kubernetes.io/instance: sloth 82 | annotations: 83 | kubectl.kubernetes.io/default-container: sloth 84 | spec: 85 | serviceAccountName: sloth 86 | containers: 87 | - name: sloth 88 | image: ghcr.io/linode-obs/sloth:v0.13.1 89 | args: 90 | - kubernetes-controller 91 | - --sli-plugins-path=/plugins 92 | ports: 93 | - containerPort: 8081 94 | name: metrics 95 | protocol: TCP 96 | volumeMounts: 97 | - name: sloth-common-sli-plugins 98 | mountPath: /plugins/sloth-common-sli-plugins 
99 | resources: 100 | limits: 101 | cpu: 50m 102 | memory: 150Mi 103 | requests: 104 | cpu: 5m 105 | memory: 75Mi 106 | - name: git-sync-plugins 107 | image: k8s.gcr.io/git-sync/git-sync:v3.6.1 108 | args: 109 | - --repo=https://github.com/slok/sloth-common-sli-plugins 110 | - --branch=main 111 | - --wait=30 112 | - --webhook-url=http://localhost:8082/-/reload 113 | volumeMounts: 114 | - name: sloth-common-sli-plugins 115 | # Default path for git-sync. 116 | mountPath: /tmp/git 117 | resources: 118 | limits: 119 | cpu: 50m 120 | memory: 100Mi 121 | requests: 122 | cpu: 5m 123 | memory: 50Mi 124 | volumes: 125 | - name: sloth-common-sli-plugins 126 | emptyDir: {} 127 | --- 128 | # Source: sloth/templates/pod-monitor.yaml 129 | apiVersion: monitoring.coreos.com/v1 130 | kind: PodMonitor 131 | metadata: 132 | name: sloth 133 | namespace: monitoring 134 | labels: 135 | helm.sh/chart: sloth-0.6.4 136 | app.kubernetes.io/managed-by: Helm 137 | app: sloth 138 | app.kubernetes.io/name: sloth 139 | app.kubernetes.io/instance: sloth 140 | spec: 141 | selector: 142 | matchLabels: 143 | app: sloth 144 | app.kubernetes.io/name: sloth 145 | app.kubernetes.io/instance: sloth 146 | podMetricsEndpoints: 147 | - port: metrics 148 | -------------------------------------------------------------------------------- /deploy/kubernetes/raw/sloth.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: sloth/templates/service-account.yaml 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: sloth 7 | namespace: monitoring 8 | labels: 9 | helm.sh/chart: sloth-0.6.4 10 | app.kubernetes.io/managed-by: Helm 11 | app: sloth 12 | app.kubernetes.io/name: sloth 13 | app.kubernetes.io/instance: sloth 14 | --- 15 | # Source: sloth/templates/cluster-role.yaml 16 | apiVersion: rbac.authorization.k8s.io/v1 17 | kind: ClusterRole 18 | metadata: 19 | name: sloth 20 | labels: 21 | helm.sh/chart: sloth-0.6.4 22 | app.kubernetes.io/managed-by: Helm 23 | app: sloth 24 | app.kubernetes.io/name: sloth 25 | app.kubernetes.io/instance: sloth 26 | rules: 27 | - apiGroups: ["sloth.slok.dev"] 28 | resources: ["*"] 29 | verbs: ["*"] 30 | 31 | - apiGroups: ["monitoring.coreos.com"] 32 | resources: ["prometheusrules"] 33 | verbs: ["create", "list", "get", "update", "watch"] 34 | --- 35 | # Source: sloth/templates/cluster-role-binding.yaml 36 | apiVersion: rbac.authorization.k8s.io/v1 37 | kind: ClusterRoleBinding 38 | metadata: 39 | name: sloth 40 | labels: 41 | helm.sh/chart: sloth-0.6.4 42 | app.kubernetes.io/managed-by: Helm 43 | app: sloth 44 | app.kubernetes.io/name: sloth 45 | app.kubernetes.io/instance: sloth 46 | roleRef: 47 | apiGroup: rbac.authorization.k8s.io 48 | kind: ClusterRole 49 | name: sloth 50 | subjects: 51 | - kind: ServiceAccount 52 | name: sloth 53 | namespace: monitoring 54 | --- 55 | # Source: sloth/templates/deployment.yaml 56 | apiVersion: apps/v1 57 | kind: Deployment 58 | metadata: 59 | name: sloth 60 | namespace: monitoring 61 | labels: 62 | helm.sh/chart: sloth-0.6.4 63 | app.kubernetes.io/managed-by: Helm 64 | app: sloth 65 | app.kubernetes.io/name: sloth 66 | app.kubernetes.io/instance: sloth 67 | spec: 68 | replicas: 1 69 | selector: 70 | matchLabels: 71 | app: sloth 72 | app.kubernetes.io/name: sloth 73 | app.kubernetes.io/instance: sloth 74 | template: 75 | metadata: 76 | labels: 77 | helm.sh/chart: sloth-0.6.4 78 | app.kubernetes.io/managed-by: Helm 79 | app: sloth 80 | app.kubernetes.io/name: sloth 81 | app.kubernetes.io/instance: sloth 82 
| annotations: 83 | kubectl.kubernetes.io/default-container: sloth 84 | spec: 85 | serviceAccountName: sloth 86 | containers: 87 | - name: sloth 88 | image: ghcr.io/linode-obs/sloth:v0.13.1 89 | args: 90 | - kubernetes-controller 91 | ports: 92 | - containerPort: 8081 93 | name: metrics 94 | protocol: TCP 95 | resources: 96 | limits: 97 | cpu: 50m 98 | memory: 150Mi 99 | requests: 100 | cpu: 5m 101 | memory: 75Mi 102 | --- 103 | # Source: sloth/templates/pod-monitor.yaml 104 | apiVersion: monitoring.coreos.com/v1 105 | kind: PodMonitor 106 | metadata: 107 | name: sloth 108 | namespace: monitoring 109 | labels: 110 | helm.sh/chart: sloth-0.6.4 111 | app.kubernetes.io/managed-by: Helm 112 | app: sloth 113 | app.kubernetes.io/name: sloth 114 | app.kubernetes.io/instance: sloth 115 | spec: 116 | selector: 117 | matchLabels: 118 | app: sloth 119 | app.kubernetes.io/name: sloth 120 | app.kubernetes.io/instance: sloth 121 | podMetricsEndpoints: 122 | - port: metrics 123 | -------------------------------------------------------------------------------- /docker/dev/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23 2 | 3 | LABEL org.opencontainers.image.source https://github.com/slok/sloth 4 | 5 | ARG GOLANGCI_LINT_VERSION="1.61.0" 6 | ARG MOCKERY_VERSION="2.46.3" 7 | ARG GOMARKDOC_VERSION="0.4.1" 8 | ARG HELM_VERSION="3.10.0" 9 | ARG ostype=Linux 10 | 11 | RUN apt-get update && apt-get install -y \ 12 | git \ 13 | bash \ 14 | zip 15 | 16 | 17 | RUN wget https://github.com/golangci/golangci-lint/releases/download/v${GOLANGCI_LINT_VERSION}/golangci-lint-${GOLANGCI_LINT_VERSION}-linux-amd64.tar.gz && \ 18 | tar zxvf golangci-lint-${GOLANGCI_LINT_VERSION}-linux-amd64.tar.gz --strip 1 -C /usr/local/bin/ && \ 19 | rm golangci-lint-${GOLANGCI_LINT_VERSION}-linux-amd64.tar.gz && \ 20 | \ 21 | wget https://github.com/vektra/mockery/releases/download/v${MOCKERY_VERSION}/mockery_${MOCKERY_VERSION}_Linux_x86_64.tar.gz && \ 22 | tar zxvf mockery_${MOCKERY_VERSION}_Linux_x86_64.tar.gz -C /tmp && \ 23 | mv /tmp/mockery /usr/local/bin/ && \ 24 | rm mockery_${MOCKERY_VERSION}_Linux_x86_64.tar.gz && \ 25 | \ 26 | wget https://github.com/princjef/gomarkdoc/releases/download/v${GOMARKDOC_VERSION}/gomarkdoc_${GOMARKDOC_VERSION}_linux_amd64.tar.gz && \ 27 | tar zxvf gomarkdoc_${GOMARKDOC_VERSION}_linux_amd64.tar.gz -C /tmp && \ 28 | mv /tmp/gomarkdoc_${GOMARKDOC_VERSION}_linux_amd64/gomarkdoc /usr/local/bin/ && \ 29 | rm -rf gomarkdoc_${GOMARKDOC_VERSION}_linux_amd64.tar.gz /tmp/gomarkdoc_${GOMARKDOC_VERSION}_linux_amd64 && \ 30 | \ 31 | wget https://get.helm.sh/helm-v${HELM_VERSION}-linux-amd64.tar.gz && \ 32 | tar zxvf helm-v${HELM_VERSION}-linux-amd64.tar.gz -C /tmp && \ 33 | mv /tmp/linux-amd64/helm /usr/local/bin/ && \ 34 | rm -rf helm-v${HELM_VERSION}-linux-amd64.tar.gz /tmp/linux-amd64 35 | 36 | 37 | # Create user. 38 | ARG uid=1000 39 | ARG gid=1000 40 | 41 | RUN bash -c 'if [ ${ostype} == Linux ]; then addgroup -gid $gid app; else addgroup app; fi && \ 42 | adduser --disabled-password -uid $uid --ingroup app --gecos "" app && \ 43 | chown app:app -R /go' 44 | 45 | # Fill Go apps cache: 46 | # Main app. 47 | RUN mkdir -p /tmp/cache 48 | COPY go.mod /tmp/cache 49 | COPY go.sum /tmp/cache 50 | RUN chown app:app -R /tmp/cache 51 | USER app 52 | RUN cd /tmp/cache && \ 53 | go mod download 54 | 55 | # Helm testing app. 
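# (Note: this is the same layer-caching technique as for the main app above: by copying only
# go.mod/go.sum and running `go mod download`, the module cache lives in a Docker layer that is
# invalidated only when the dependency files change, not on every source change.)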
56 | USER root 57 | RUN mkdir -p /tmp/cache 58 | COPY deploy/kubernetes/helm/sloth/tests/go.mod /tmp/cache 59 | COPY deploy/kubernetes/helm/sloth/tests/go.sum /tmp/cache 60 | RUN chown app:app -R /tmp/cache 61 | USER app 62 | RUN cd /tmp/cache && \ 63 | go mod download 64 | 65 | USER app 66 | WORKDIR /src 67 | -------------------------------------------------------------------------------- /docker/prod/Dockerfile: -------------------------------------------------------------------------------- 1 | # Set also `ARCH` ARG here so we can use it on all the `FROM`s. 2 | ARG ARCH 3 | 4 | FROM golang:1.23-alpine as build-stage 5 | 6 | LABEL org.opencontainers.image.source https://github.com/slok/sloth 7 | 8 | RUN apk --no-cache add \ 9 | g++ \ 10 | git \ 11 | make \ 12 | curl \ 13 | bash 14 | 15 | # Required by the build script for setting the version and cross-compiling. 16 | ARG VERSION 17 | ENV VERSION=${VERSION} 18 | ARG ARCH 19 | ENV GOARCH=${ARCH} 20 | 21 | # Compile. 22 | WORKDIR /src 23 | COPY . . 24 | RUN ./scripts/build/bin/build-raw.sh 25 | 26 | 27 | # Although we are building on a specific architecture (normally linux/amd64), our Go binary has been built for 28 | # the ${ARCH} architecture. 29 | # To make our build process portable we base the final image on that same architecture as the binary, 30 | # obtaining a resulting ${ARCH} image independently of where we build this image. 31 | FROM gcr.io/distroless/static:nonroot-${ARCH} 32 | 33 | COPY --from=build-stage /src/bin/sloth /usr/local/bin/sloth 34 | 35 | ENTRYPOINT ["/usr/local/bin/sloth"] 36 | -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linode-obs/sloth/a80a2f7e75d32e40e2cb2a2eaf205c0f8b3e97f0/docs/img/logo.png -------------------------------------------------------------------------------- /docs/img/sloth_small_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linode-obs/sloth/a80a2f7e75d32e40e2cb2a2eaf205c0f8b3e97f0/docs/img/sloth_small_dashboard.png -------------------------------------------------------------------------------- /examples/custom_rule_group_interval.yml: -------------------------------------------------------------------------------- 1 | # This example shows how you can adjust the Prometheus rule_group interval for expensive SLOs. 2 | # https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group 3 | # The SLO SLI measures the rate of CPU seconds spent performing softirqs. 4 | # 5 | # `sloth generate -i ./examples/custom_rule_group_interval.yml` 6 | # 7 | version: "prometheus/v1" 8 | service: "myapp" 9 | labels: 10 | owner: "myteam" 11 | slos: 12 | - name: "cpu-availability" 13 | objective: 99.99 14 | description: "Example, expensive SLO. Recording rules will run every 2 minutes."
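# As a rough sketch of the effect (group names here are illustrative, not guaranteed output),
# the interval settings below end up as per-group `interval` fields in the generated rules file:
#
#   groups:
#     - name: sloth-slo-sli-recordings-myapp-cpu-availability
#       interval: 4m
#       rules: [...]
#     - name: sloth-slo-meta-recordings-myapp-cpu-availability
#       interval: 2m
#       rules: [...]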
15 | # Alternative way of specifying the interval for all three sets of rules: 16 | # interval: 17 | # all: "5m" 18 | interval: # each of these is a different set of rule groups Sloth can generate 19 | slierror: "4m" 20 | metadata: "2m" 21 | alert: "2m" 22 | sli: 23 | events: 24 | error_query: | 25 | sum( 26 | rate(node_cpu_seconds_total{mode="softirq"}[{{.window}}]) 27 | ) 28 | total_query: | 29 | sum( 30 | rate(node_cpu_seconds_total[{{.window}}]) 31 | ) 32 | alerting: 33 | name: MyServiceHighErrorRate 34 | labels: 35 | category: "availability" 36 | annotations: 37 | summary: "High error rate on 'myservice' requests responses" 38 | page_alert: 39 | labels: 40 | severity: pageteam 41 | routing_key: myteam 42 | ticket_alert: 43 | disable: true 44 | -------------------------------------------------------------------------------- /examples/getting-started.yml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "myservice" 3 | labels: 4 | owner: "myteam" 5 | repo: "myorg/myservice" 6 | tier: "2" 7 | slos: 8 | # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%). 9 | - name: "requests-availability" 10 | objective: 99.9 11 | description: "Common SLO based on availability for HTTP request responses." 12 | sli: 13 | events: 14 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 15 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 16 | alerting: 17 | name: MyServiceHighErrorRate 18 | labels: 19 | category: "availability" 20 | annotations: 21 | # Overwrite default Sloth SLO alert summary on ticket and page alerts. 22 | summary: "High error rate on 'myservice' requests responses" 23 | page_alert: 24 | labels: 25 | severity: pageteam 26 | routing_key: myteam 27 | ticket_alert: 28 | labels: 29 | severity: "slack" 30 | slack_channel: "#alerts-myteam" 31 | -------------------------------------------------------------------------------- /examples/home-wifi.yml: -------------------------------------------------------------------------------- 1 | # This example shows a real service level used in my home to have SLOs on my wifi signal. 2 | # The metrics are extracted using unifi-poller (https://github.com/unifi-poller/unifi-poller) 3 | # that gets the information from an Ubiquiti Wifi installation. 4 | # https://community.ui.com/questions/satisfaction-percentage-in-client-properties-overview/8c940637-63d0-41de-a67b-8166cdd0ed32 5 | # 6 | # The service level has 2 SLOs based on `client_satisfaction_ratio`, a ratio calculated 7 | # by Ubiquiti based on wifi packet drops, wifi signal... 8 | # We consider each of the currently existing client satisfactions an SLI event; let's review the SLOs 9 | # 10 | # - `good-wifi-client-satisfaction` 11 | #   - This SLO warns us that we don't have a good wifi at home. 12 | #   - SLI error: We consider a bad client satisfaction (event) below 75% (0.75) 13 | #   - SLO objective (95%): We are not so restrictive and we allow 5 of every 100 clients to be below 75% 14 | # 15 | # - `risk-wifi-client-satisfaction` 16 | #   - This SLO warns us that something very bad is happening with our home wifi.
17 | #   - SLI error: We consider a bad client satisfaction (event) below 50% (0.5) 18 | #   - SLO objective (99.9%): We are very restrictive and we allow 1 of every 1000 clients to be below 50% 19 | # 20 | # `sloth generate -i ./examples/home-wifi.yml` 21 | # 22 | version: "prometheus/v1" 23 | service: "home-wifi" 24 | labels: 25 | cluster: "valhalla" 26 | component: "ubiquiti" 27 | context: "home" 28 | slos: 29 | - name: "good-wifi-client-satisfaction" 30 | objective: 95 31 | description: "Will warn us that we don't have a good wifi at home." 32 | sli: 33 | events: 34 | error_query: sum_over_time((count(unifipoller_client_satisfaction_ratio < 0.75))[{{.window}}:]) OR on() vector(0) 35 | total_query: sum_over_time((count(unifipoller_client_satisfaction_ratio))[{{.window}}:]) 36 | alerting: 37 | name: GoodWifiClientSatisfaction 38 | page_alert: 39 | labels: 40 | severity: home 41 | ticket_alert: 42 | labels: 43 | severity: warning 44 | 45 | - name: "risk-wifi-client-satisfaction" 46 | objective: 99.9 47 | description: "Will warn us that something very bad is happening with our home wifi." 48 | sli: 49 | events: 50 | error_query: sum_over_time((count(unifipoller_client_satisfaction_ratio < 0.5))[{{.window}}:]) OR on() vector(0) 51 | total_query: sum_over_time((count(unifipoller_client_satisfaction_ratio))[{{.window}}:]) 52 | alerting: 53 | name: RiskWifiClientSatisfaction 54 | page_alert: 55 | labels: 56 | severity: home 57 | ticket_alert: 58 | labels: 59 | severity: warning 60 | -------------------------------------------------------------------------------- /examples/k8s-getting-started.yml: -------------------------------------------------------------------------------- 1 | # This example shows the same example as getting-started.yml but using Sloth Kubernetes CRD. 2 | # It will generate the Prometheus rules in a Kubernetes prometheus-operator PrometheusRules CRD. 3 | # 4 | # `sloth generate -i ./examples/k8s-getting-started.yml` 5 | # 6 | apiVersion: sloth.slok.dev/v1 7 | kind: PrometheusServiceLevel 8 | metadata: 9 | name: sloth-slo-my-service 10 | namespace: monitoring 11 | spec: 12 | service: "myservice" 13 | labels: 14 | owner: "myteam" 15 | repo: "myorg/myservice" 16 | tier: "2" 17 | slos: 18 | - name: "requests-availability" 19 | objective: 99.9 20 | description: "Common SLO based on availability for HTTP request responses." 21 | sli: 22 | events: 23 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 24 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 25 | alerting: 26 | name: MyServiceHighErrorRate 27 | labels: 28 | category: "availability" 29 | annotations: 30 | summary: "High error rate on 'myservice' requests responses" 31 | pageAlert: 32 | labels: 33 | severity: pageteam 34 | routing_key: myteam 35 | ticketAlert: 36 | labels: 37 | severity: "slack" 38 | slack_channel: "#alerts-myteam" 39 | -------------------------------------------------------------------------------- /examples/k8s-home-wifi.yml: -------------------------------------------------------------------------------- 1 | # This example shows the same example as home-wifi.yml but using Sloth Kubernetes CRD. 2 | # It will generate the Prometheus rules in a Kubernetes prometheus-operator PrometheusRules CRD.
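# Besides offline generation with the CLI, this manifest can also be applied directly to a cluster
# running the Sloth Kubernetes controller, which will create the resulting PrometheusRule for us.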
3 | # 4 | # `sloth generate -i ./examples/k8s-home-wifi.yml` 5 | # 6 | apiVersion: sloth.slok.dev/v1 7 | kind: PrometheusServiceLevel 8 | metadata: 9 | name: sloth-slo-home-wifi 10 | namespace: monitoring 11 | labels: 12 | prometheus: prometheus 13 | role: alert-rules 14 | app: sloth 15 | spec: 16 | service: "home-wifi" 17 | labels: 18 | cluster: "valhalla" 19 | component: "ubiquiti" 20 | context: "home" 21 | slos: 22 | - name: "good-wifi-client-satisfaction" 23 | objective: 95 24 | description: "Will warn us that we don't have a good wifi at home." 25 | sli: 26 | events: 27 | errorQuery: sum_over_time((count(unifipoller_client_satisfaction_ratio < 0.75))[{{.window}}:]) OR on() vector(0) 28 | totalQuery: sum_over_time((count(unifipoller_client_satisfaction_ratio))[{{.window}}:]) 29 | alerting: 30 | name: GoodWifiClientSatisfaction 31 | pageAlert: 32 | labels: 33 | severity: home 34 | ticketAlert: 35 | labels: 36 | severity: warning 37 | 38 | - name: "risk-wifi-client-satisfaction" 39 | objective: 99.9 40 | description: "Will warn us that something very bad is happening with our home wifi." 41 | sli: 42 | events: 43 | errorQuery: sum_over_time((count(unifipoller_client_satisfaction_ratio < 0.5))[{{.window}}:]) OR on() vector(0) 44 | totalQuery: sum_over_time((count(unifipoller_client_satisfaction_ratio))[{{.window}}:]) 45 | alerting: 46 | name: RiskWifiClientSatisfaction 47 | pageAlert: 48 | labels: 49 | severity: home 50 | ticketAlert: 51 | labels: 52 | severity: warning 53 | -------------------------------------------------------------------------------- /examples/k8s-multifile.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This example shows the same example as getting-started.yml but using Sloth Kubernetes CRD and multifile. 3 | # It will generate the Prometheus rules in a Kubernetes prometheus-operator PrometheusRules CRD. 4 | # 5 | # `sloth generate -i ./examples/k8s-multifile.yml` 6 | # 7 | apiVersion: sloth.slok.dev/v1 8 | kind: PrometheusServiceLevel 9 | metadata: 10 | name: sloth-slo-my-service 11 | namespace: monitoring 12 | spec: 13 | service: "myservice" 14 | labels: 15 | owner: "myteam" 16 | repo: "myorg/myservice" 17 | tier: "2" 18 | slos: 19 | - name: "requests-availability" 20 | objective: 99.9 21 | description: "Common SLO based on availability for HTTP request responses." 22 | sli: 23 | events: 24 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 25 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 26 | alerting: 27 | name: MyServiceHighErrorRate 28 | labels: 29 | category: "availability" 30 | annotations: 31 | summary: "High error rate on 'myservice' requests responses" 32 | pageAlert: 33 | labels: 34 | severity: pageteam 35 | routing_key: myteam 36 | ticketAlert: 37 | labels: 38 | severity: "slack" 39 | slack_channel: "#alerts-myteam" 40 | --- 41 | apiVersion: sloth.slok.dev/v1 42 | kind: PrometheusServiceLevel 43 | metadata: 44 | name: sloth-slo-my-service2 45 | namespace: monitoring 46 | spec: 47 | service: "myservice2" 48 | labels: 49 | owner: "myteam2" 50 | repo: "myorg/myservice2" 51 | tier: "1" 52 | slos: 53 | - name: "requests-availability" 54 | objective: 99.99 55 | description: "Common SLO based on availability for HTTP request responses."
56 | sli: 57 | events: 58 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 59 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 60 | alerting: 61 | name: MyServiceHighErrorRate 62 | labels: 63 | category: "availability" 64 | annotations: 65 | summary: "High error rate on 'myservice' requests responses" 66 | pageAlert: 67 | labels: 68 | severity: pageteam 69 | routing_key: myteam 70 | ticketAlert: 71 | labels: 72 | severity: "slack" 73 | slack_channel: "#alerts-myteam" 74 | -------------------------------------------------------------------------------- /examples/kubernetes-apiserver.yml: -------------------------------------------------------------------------------- 1 | # This example shows a real service level used for Kubernetes Apiserver. 2 | # 3 | # The service level has 2 SLOs based on Apiserver requests/responses. 4 | # 5 | # We consider an SLI event the requests made to the server; let's review the SLOs 6 | # 7 | # - `requests-availability` 8 | #   - This SLO warns us that we are returning the requests correctly to the clients (kubectl users, controllers...). 9 | #   - SLI error: We consider a bad request (event) a request with code >=500 or 429 10 | #   - SLO objective (99.9%): We are restrictive with this because we only allow failing a request every 1000. 11 | # 12 | # - `requests-latency` 13 | #   - This SLO warns us that the apiserver responses are being slow and this will affect the clients (kubectl users, controllers...). 14 | #   - SLI error: We consider a bad request (event) when the response latency is >400ms. 15 | #   - SLO objective (99%): We have a relaxed objective because Kubernetes has a lot of async and eventual consistency flows. We could 16 | #     create in the future another SLO that is less restrictive and use the latency of the realtime requests (e.g: kubectl). 17 | # 18 | # `sloth generate -i ./examples/kubernetes-apiserver.yml` 19 | # 20 | version: "prometheus/v1" 21 | service: "k8s-apiserver" 22 | labels: 23 | cluster: "valhalla" 24 | component: "kubernetes" 25 | slos: 26 | - name: "requests-availability" 27 | objective: 99.9 28 | description: "Warn that we are returning the requests correctly to the clients (kubectl users, controllers...)." 29 | labels: 30 | category: availability 31 | sli: 32 | events: 33 | error_query: sum(rate(apiserver_request_total{code=~"(5..|429)"}[{{.window}}])) 34 | total_query: sum(rate(apiserver_request_total[{{.window}}])) 35 | alerting: 36 | name: K8sApiserverAvailabilityAlert 37 | labels: 38 | category: "availability" 39 | annotations: 40 | runbook: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh" 41 | page_alert: 42 | labels: 43 | severity: critical 44 | ticket_alert: 45 | labels: 46 | severity: warning 47 | 48 | - name: "requests-latency" 49 | objective: 99 50 | description: "Warn that the apiserver responses are being slow and this will affect the clients (kubectl users, controllers...)."
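# (In other words: an error event is a request slower than 400ms. The error query below takes all
# requests and subtracts the ones that landed in the le="0.4" histogram bucket.)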
51 | labels: 52 | category: latency 53 | sli: 54 | events: 55 | error_query: | 56 | ( 57 | sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[{{.window}}])) 58 | - 59 | sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[{{.window}}])) 60 | ) 61 | total_query: sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[{{.window}}])) 62 | alerting: 63 | name: K8sApiserverLatencyAlert 64 | labels: 65 | category: "latency" 66 | annotations: 67 | runbook: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh" 68 | page_alert: 69 | labels: 70 | severity: critical 71 | ticket_alert: 72 | labels: 73 | severity: warning 74 | -------------------------------------------------------------------------------- /examples/multifile.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: "prometheus/v1" 3 | service: "myservice" 4 | labels: 5 | owner: "myteam" 6 | repo: "myorg/myservice" 7 | tier: "2" 8 | slos: 9 | # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%). 10 | - name: "requests-availability" 11 | objective: 99.9 12 | description: "Common SLO based on availability for HTTP request responses." 13 | sli: 14 | events: 15 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 16 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 17 | alerting: 18 | name: MyServiceHighErrorRate 19 | labels: 20 | category: "availability" 21 | annotations: 22 | # Overwrite default Sloth SLO alert summary on ticket and page alerts. 23 | summary: "High error rate on 'myservice' requests responses" 24 | page_alert: 25 | labels: 26 | severity: pageteam 27 | routing_key: myteam 28 | ticket_alert: 29 | labels: 30 | severity: "slack" 31 | slack_channel: "#alerts-myteam" 32 | 33 | --- 34 | version: "prometheus/v1" 35 | service: "myservice2" 36 | labels: 37 | owner: "myteam2" 38 | repo: "myorg/myservice2" 39 | tier: "1" 40 | slos: 41 | # We allow failing (5xx and 429) 1 request every 10000 requests (99.99%). 42 | - name: "requests-availability" 43 | objective: 99.99 44 | description: "Common SLO based on availability for HTTP request responses." 45 | sli: 46 | events: 47 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 48 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 49 | alerting: 50 | name: MyServiceHighErrorRate 51 | labels: 52 | category: "availability" 53 | annotations: 54 | # Overwrite default Sloth SLO alert summary on ticket and page alerts. 55 | summary: "High error rate on 'myservice' requests responses" 56 | page_alert: 57 | labels: 58 | severity: pageteam 59 | routing_key: myteam 60 | ticket_alert: 61 | labels: 62 | severity: "slack" 63 | slack_channel: "#alerts-myteam" 64 | -------------------------------------------------------------------------------- /examples/no-alerts.yml: -------------------------------------------------------------------------------- 1 | # This example shows a simple service level by implementing a single SLO without alerts. 2 | # It disables page (critical) and ticket (warning) alerts. 3 | # The SLO SLI measures error events as the HTTP request responses with code >=500 or 429.
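# Disabling the alerts only skips the alert rules; the SLI and metadata recording rules are still generated.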
4 | # 5 | # `sloth generate -i ./examples/no-alerts.yml` 6 | # 7 | version: "prometheus/v1" 8 | service: "myapp" 9 | labels: 10 | owner: "myteam" 11 | slos: 12 | - name: "http-availability" 13 | objective: 99.99 14 | description: "Common SLO based on availability for HTTP request responses." 15 | sli: 16 | events: 17 | error_query: | 18 | sum( 19 | rate(http_request_duration_seconds_count{job="myapp", code=~"(5..|429)"}[{{.window}}]) 20 | ) 21 | total_query: | 22 | sum( 23 | rate(http_request_duration_seconds_count{job="myapp"}[{{.window}}]) 24 | ) 25 | alerting: 26 | page_alert: 27 | disable: true 28 | ticket_alert: 29 | disable: true 30 | -------------------------------------------------------------------------------- /examples/openslo-getting-started.yml: -------------------------------------------------------------------------------- 1 | # This example shows the same example as getting-started.yml but using OpenSLO spec. 2 | # It will generate the Prometheus rules in a Prometheus rules format. 3 | # 4 | # `sloth generate -i ./examples/openslo-getting-started.yml` 5 | # 6 | apiVersion: openslo/v1alpha 7 | kind: SLO 8 | metadata: 9 | name: sloth-slo-my-service 10 | displayName: Requests Availability 11 | spec: 12 | service: my-service 13 | description: "Common SLO based on availability for HTTP request responses." 14 | budgetingMethod: Occurrences 15 | objectives: 16 | - ratioMetrics: 17 | good: 18 | source: prometheus 19 | queryType: promql 20 | query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}])) 21 | total: 22 | source: prometheus 23 | queryType: promql 24 | query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 25 | target: 0.999 26 | timeWindows: 27 | - count: 30 28 | unit: Day 29 | -------------------------------------------------------------------------------- /examples/openslo-kubernetes-apiserver.yml: -------------------------------------------------------------------------------- 1 | # This example shows the same example as kubernetes-apiserver.yml but using OpenSLO spec. 2 | # It will generate the Prometheus rules in a Prometheus rules format. 3 | # 4 | # Take into account that OpenSLO spec has the concept of a single SLO with multiple objectives. 5 | # 6 | # `sloth generate -i ./examples/openslo-kubernetes-apiserver.yml` 7 | # 8 | apiVersion: openslo/v1alpha 9 | kind: SLO 10 | metadata: 11 | name: requests-availability-openslo 12 | displayName: Requests Availability 13 | spec: 14 | service: k8s-apiserver 15 | description: "Apiserver is returning the requests correctly to the clients (kubectl users, controllers...)." 16 | budgetingMethod: Occurrences 17 | objectives: 18 | - ratioMetrics: 19 | good: 20 | source: prometheus 21 | queryType: promql 22 | query: sum(rate(apiserver_request_total{code!~"(5..|429)"}[{{.window}}])) 23 | total: 24 | source: prometheus 25 | queryType: promql 26 | query: sum(rate(apiserver_request_total[{{.window}}])) 27 | target: 0.999 28 | 29 | timeWindows: 30 | - count: 30 31 | unit: Day 32 | 33 | --- 34 | apiVersion: openslo/v1alpha 35 | kind: SLO 36 | metadata: 37 | name: requests-latency-openslo 38 | displayName: Requests Latency 39 | spec: 40 | service: k8s-apiserver 41 | description: "Apiserver responses are fast enough not to affect the clients (kubectl users, controllers...)."
42 | budgetingMethod: Occurrences 43 | objectives: 44 | - ratioMetrics: 45 | good: 46 | source: prometheus 47 | queryType: promql 48 | query: sum(rate(apiserver_request_duration_seconds_bucket{le="0.4",verb!="WATCH"}[{{.window}}])) 49 | total: 50 | source: prometheus 51 | queryType: promql 52 | query: sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[{{.window}}])) 53 | target: 0.99 54 | 55 | - ratioMetrics: 56 | good: 57 | source: prometheus 58 | queryType: promql 59 | query: sum(rate(apiserver_request_duration_seconds_bucket{le="5",verb!="WATCH"}[{{.window}}])) 60 | total: 61 | source: prometheus 62 | queryType: promql 63 | query: sum(rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[{{.window}}])) 64 | target: 0.999 65 | 66 | timeWindows: 67 | - count: 30 68 | unit: Day 69 | -------------------------------------------------------------------------------- /examples/plugin-getting-started.yml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "myservice" 3 | labels: 4 | owner: "myteam" 5 | repo: "myorg/myservice" 6 | tier: "2" 7 | slos: 8 | # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%). 9 | - name: "requests-availability" 10 | objective: 99.9 11 | description: "Common SLO based on availability for HTTP request responses." 12 | sli: 13 | plugin: 14 | id: "getting_started_availability" 15 | options: 16 | job: "myservice" 17 | filter: 'f1="v1",f2="v2"' 18 | alerting: 19 | name: MyServiceHighErrorRate 20 | labels: 21 | category: "availability" 22 | annotations: 23 | # Overwrite default Sloth SLO alert summary on ticket and page alerts. 24 | summary: "High error rate on 'myservice' requests responses" 25 | page_alert: 26 | labels: 27 | severity: pageteam 28 | routing_key: myteam 29 | ticket_alert: 30 | labels: 31 | severity: "slack" 32 | slack_channel: "#alerts-myteam" 33 | -------------------------------------------------------------------------------- /examples/plugin-k8s-getting-started.yml: -------------------------------------------------------------------------------- 1 | # This example shows the same example as plugin-getting-started.yml but using Sloth Kubernetes CRD. 2 | # It will generate the Prometheus rules in a Kubernetes prometheus-operator PrometheusRules CRD. 3 | # 4 | # `sloth generate -i ./examples/plugin-k8s-getting-started.yml -p ./examples` 5 | # 6 | apiVersion: sloth.slok.dev/v1 7 | kind: PrometheusServiceLevel 8 | metadata: 9 | name: sloth-slo-home-wifi 10 | namespace: monitoring 11 | labels: 12 | prometheus: prometheus 13 | role: alert-rules 14 | app: sloth 15 | spec: 16 | service: "myservice" 17 | labels: 18 | owner: "myteam" 19 | repo: "myorg/myservice" 20 | tier: "2" 21 | slos: 22 | # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%). 23 | - name: "requests-availability" 24 | objective: 99.9 25 | description: "Common SLO based on availability for HTTP request responses." 26 | sli: 27 | plugin: 28 | id: "getting_started_availability" 29 | options: 30 | job: "myservice" 31 | filter: 'f1="v1",f2="v2"' 32 | alerting: 33 | name: MyServiceHighErrorRate 34 | labels: 35 | category: "availability" 36 | annotations: 37 | # Overwrite default Sloth SLO alert summary on ticket and page alerts.
38 | summary: "High error rate on 'myservice' requests responses" 39 | page_alert: 40 | labels: 41 | severity: pageteam 42 | routing_key: myteam 43 | ticket_alert: 44 | labels: 45 | severity: "slack" 46 | slack_channel: "#alerts-myteam" 47 | -------------------------------------------------------------------------------- /examples/plugins/getting-started/availability/plugin.go: -------------------------------------------------------------------------------- 1 | package availability 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "regexp" 8 | "strings" 9 | "text/template" 10 | ) 11 | 12 | const ( 13 | SLIPluginVersion = "prometheus/v1" 14 | SLIPluginID = "getting_started_availability" 15 | ) 16 | 17 | var queryTpl = template.Must(template.New("").Parse(` 18 | sum(rate(http_request_duration_seconds_count{ {{.filter}}job="{{.job}}",code=~"(5..|429)" }[{{"{{.window}}"}}])) 19 | / 20 | sum(rate(http_request_duration_seconds_count{ {{.filter}}job="{{.job}}" }[{{"{{.window}}"}}]))`)) 21 | 22 | var filterRegex = regexp.MustCompile(`([^=]+="[^=,"]+",)+`) 23 | 24 | // SLIPlugin is the getting started plugin example. 25 | // 26 | // It will return a Sloth error ratio raw query that returns the error ratio of HTTP requests based 27 | // on the HTTP response status code, taking 5xx and 429 as error events. 28 | func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) { 29 | // Get job. 30 | job, ok := options["job"] 31 | if !ok { 32 | return "", fmt.Errorf("job options is required") 33 | } 34 | 35 | // Validate labels. 36 | err := validateLabels(labels, "owner", "tier") 37 | if err != nil { 38 | return "", fmt.Errorf("invalid labels: %w", err) 39 | } 40 | 41 | // Sanitize filter. 42 | filter := options["filter"] 43 | if filter != "" { 44 | filter = strings.Trim(filter, "{}") 45 | filter = strings.Trim(filter, ",") 46 | filter = filter + "," 47 | match := filterRegex.MatchString(filter) 48 | if !match { 49 | return "", fmt.Errorf("invalid prometheus filter: %s", filter) 50 | } 51 | } 52 | 53 | // Create query. 54 | var b bytes.Buffer 55 | data := map[string]string{ 56 | "job": job, 57 | "filter": filter, 58 | } 59 | err = queryTpl.Execute(&b, data) 60 | if err != nil { 61 | return "", fmt.Errorf("could not execute template: %w", err) 62 | } 63 | 64 | return b.String(), nil 65 | } 66 | 67 | // validateLabels will check the labels exist. 68 | func validateLabels(labels map[string]string, requiredKeys ...string) error { 69 | for _, k := range requiredKeys { 70 | v, ok := labels[k] 71 | if !ok || (ok && v == "") { 72 | return fmt.Errorf("%q label is required", k) 73 | } 74 | } 75 | 76 | return nil 77 | } 78 | -------------------------------------------------------------------------------- /examples/raw-home-wifi.yml: -------------------------------------------------------------------------------- 1 | # This example shows another, less accurate but simpler, way of creating the home wifi SLO. 2 | # 3 | # The metrics already give us a ratio for each wifi connection satisfaction; instead of counting 4 | # good and bad events as connections with a minimum satisfaction ratio, we will calculate the average of all 5 | # connection satisfaction ratios over the time window. 6 | # So we can't use the `events` SLI because we are not going to divide bad and total events. 7 | # 8 | # - `wifi-client-satisfaction` 9 | #   - This SLO warns us about the average wifi connection satisfaction.
10 | #   - SLI error: Calculated internally by Ubiquiti's metrics; we use the ratio directly. 11 | #   - SLO objective (95%): We expect the average wifi connection satisfaction to be >=95% 12 | # 13 | # `sloth generate -i ./examples/raw-home-wifi.yml` 14 | # 15 | version: "prometheus/v1" 16 | service: "home-wifi" 17 | labels: 18 | cluster: "valhalla" 19 | component: "ubiquiti" 20 | context: "home" 21 | slos: 22 | - name: "wifi-client-satisfaction" 23 | objective: 95 24 | description: "Warn us about the average wifi connection satisfaction." 25 | sli: 26 | raw: 27 | # Get the average satisfaction ratio and subtract it from 1 (max good) to get the error ratio. 28 | error_ratio_query: | 29 | 1 - ( 30 | sum(sum_over_time(unifipoller_client_satisfaction_ratio[{{.window}}])) 31 | / 32 | sum(count_over_time(unifipoller_client_satisfaction_ratio[{{.window}}])) 33 | ) 34 | alerting: 35 | name: WifiClientSatisfaction 36 | page_alert: 37 | labels: 38 | severity: home 39 | ticket_alert: 40 | labels: 41 | severity: warning 42 | -------------------------------------------------------------------------------- /examples/windows/7d.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: sloth.slok.dev/v1 2 | kind: AlertWindows 3 | spec: 4 | sloPeriod: 7d 5 | page: 6 | quick: 7 | errorBudgetPercent: 8 8 | shortWindow: 5m 9 | longWindow: 1h 10 | slow: 11 | errorBudgetPercent: 12.5 12 | shortWindow: 30m 13 | longWindow: 6h 14 | ticket: 15 | quick: 16 | errorBudgetPercent: 20 17 | shortWindow: 2h 18 | longWindow: 1d 19 | slow: 20 | errorBudgetPercent: 42 21 | shortWindow: 6h 22 | longWindow: 3d 23 | -------------------------------------------------------------------------------- /examples/windows/custom-30d.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: sloth.slok.dev/v1 2 | kind: AlertWindows 3 | spec: 4 | sloPeriod: 30d 5 | page: 6 | quick: 7 | errorBudgetPercent: 1 8 | shortWindow: 2m 9 | longWindow: 30m 10 | slow: 11 | errorBudgetPercent: 2 12 | shortWindow: 15m 13 | longWindow: 3h 14 | ticket: 15 | quick: 16 | errorBudgetPercent: 5 17 | shortWindow: 1h 18 | longWindow: 12h 19 | slow: 20 | errorBudgetPercent: 5 21 | shortWindow: 3h 22 | longWindow: 36h 23 | -------------------------------------------------------------------------------- /internal/alert/alert.go: -------------------------------------------------------------------------------- 1 | package alert 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | ) 8 | 9 | // Severity is the type of alert. 10 | type Severity int 11 | 12 | const ( 13 | UnknownAlertSeverity Severity = iota 14 | PageAlertSeverity 15 | TicketAlertSeverity 16 | ) 17 | 18 | func (s Severity) String() string { 19 | switch s { 20 | case PageAlertSeverity: 21 | return "page" 22 | case TicketAlertSeverity: 23 | return "ticket" 24 | default: 25 | return "unknown" 26 | } 27 | } 28 | 29 | // MWMBAlert represents a multiwindow, multi-burn rate alert. 30 | type MWMBAlert struct { 31 | ID string 32 | ShortWindow time.Duration 33 | LongWindow time.Duration 34 | BurnRateFactor float64 35 | ErrorBudget float64 36 | Severity Severity 37 | } 38 | 39 | // MWMBAlertGroup represents all the alerts of an SLO. 40 | // It's divided into two groups, each made of 2 alerts: 41 | // - Page & quick: Critical alerts that trigger in high rate burn in short term. 42 | // - Page & slow: Critical alerts that trigger in high-normal rate burn in medium term.
43 | // - Ticket & quick: Warning alerts that trigger in normal rate burn in medium term. 44 | // - Ticket & slow: Warning alerts that trigger in slow rate burn in long term. 45 | type MWMBAlertGroup struct { 46 | PageQuick MWMBAlert 47 | PageSlow MWMBAlert 48 | TicketQuick MWMBAlert 49 | TicketSlow MWMBAlert 50 | } 51 | 52 | // WindowsRepo knows how to retrieve windows based on the period of time. 53 | type WindowsRepo interface { 54 | GetWindows(ctx context.Context, period time.Duration) (*Windows, error) 55 | } 56 | 57 | // Generator knows how to generate all the required alerts based on an SLO. 58 | // The generated alerts are generic and don't depend on any specific SLO implementation. 59 | type Generator struct { 60 | windowsRepo WindowsRepo 61 | } 62 | 63 | func NewGenerator(windowsRepo WindowsRepo) Generator { 64 | return Generator{ 65 | windowsRepo: windowsRepo, 66 | } 67 | } 68 | 69 | type SLO struct { 70 | ID string 71 | TimeWindow time.Duration 72 | Objective float64 73 | } 74 | 75 | func (g Generator) GenerateMWMBAlerts(ctx context.Context, slo SLO) (*MWMBAlertGroup, error) { 76 | windows, err := g.windowsRepo.GetWindows(ctx, slo.TimeWindow) 77 | if err != nil { 78 | return nil, fmt.Errorf("the %s SLO period time window is not supported", slo.TimeWindow) 79 | } 80 | 81 | errorBudget := 100 - slo.Objective 82 | 83 | group := MWMBAlertGroup{ 84 | PageQuick: MWMBAlert{ 85 | ID: fmt.Sprintf("%s-page-quick", slo.ID), 86 | ShortWindow: windows.PageQuick.ShortWindow, 87 | LongWindow: windows.PageQuick.LongWindow, 88 | BurnRateFactor: windows.GetSpeedPageQuick(), 89 | ErrorBudget: errorBudget, 90 | Severity: PageAlertSeverity, 91 | }, 92 | PageSlow: MWMBAlert{ 93 | ID: fmt.Sprintf("%s-page-slow", slo.ID), 94 | ShortWindow: windows.PageSlow.ShortWindow, 95 | LongWindow: windows.PageSlow.LongWindow, 96 | BurnRateFactor: windows.GetSpeedPageSlow(), 97 | ErrorBudget: errorBudget, 98 | Severity: PageAlertSeverity, 99 | }, 100 | TicketQuick: MWMBAlert{ 101 | ID: fmt.Sprintf("%s-ticket-quick", slo.ID), 102 | ShortWindow: windows.TicketQuick.ShortWindow, 103 | LongWindow: windows.TicketQuick.LongWindow, 104 | BurnRateFactor: windows.GetSpeedTicketQuick(), 105 | ErrorBudget: errorBudget, 106 | Severity: TicketAlertSeverity, 107 | }, 108 | TicketSlow: MWMBAlert{ 109 | ID: fmt.Sprintf("%s-ticket-slow", slo.ID), 110 | ShortWindow: windows.TicketSlow.ShortWindow, 111 | LongWindow: windows.TicketSlow.LongWindow, 112 | BurnRateFactor: windows.GetSpeedTicketSlow(), 113 | ErrorBudget: errorBudget, 114 | Severity: TicketAlertSeverity, 115 | }, 116 | } 117 | 118 | return &group, nil 119 | } 120 | -------------------------------------------------------------------------------- /internal/alert/windows/google-28d.yaml: -------------------------------------------------------------------------------- 1 | # Common and safe 4-week windows. 2 | # 3 | # Numbers obtained from https://sre.google/workbook/alerting-on-slos/#recommended_parameters_for_an_slo_based_a.
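# Conceptual note (added for clarity, not part of the spec): each burn-rate factor follows from
# these numbers as roughly errorBudgetPercent/100 * sloPeriod / longWindow.
# E.g. for the page-quick alert below: 0.02 * 672h / 1h = 13.44x the steady error budget burn rate.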
4 | apiVersion: "sloth.slok.dev/v1" 5 | kind: "AlertWindows" 6 | spec: 7 | sloPeriod: 28d 8 | page: 9 | quick: 10 | errorBudgetPercent: 2 11 | shortWindow: 5m 12 | longWindow: 1h 13 | slow: 14 | errorBudgetPercent: 5 15 | shortWindow: 30m 16 | longWindow: 6h 17 | ticket: 18 | quick: 19 | errorBudgetPercent: 10 20 | shortWindow: 2h 21 | longWindow: 1d 22 | slow: 23 | errorBudgetPercent: 10 24 | shortWindow: 6h 25 | longWindow: 3d -------------------------------------------------------------------------------- /internal/alert/windows/google-30d.yaml: -------------------------------------------------------------------------------- 1 | # Common and safe month windows. 2 | # 3 | # Numbers obtained from https://sre.google/workbook/alerting-on-slos/#recommended_parameters_for_an_slo_based_a. 4 | apiVersion: "sloth.slok.dev/v1" 5 | kind: "AlertWindows" 6 | spec: 7 | sloPeriod: 30d 8 | page: 9 | quick: 10 | errorBudgetPercent: 2 11 | shortWindow: 5m 12 | longWindow: 1h 13 | slow: 14 | errorBudgetPercent: 5 15 | shortWindow: 30m 16 | longWindow: 6h 17 | ticket: 18 | quick: 19 | errorBudgetPercent: 10 20 | shortWindow: 2h 21 | longWindow: 1d 22 | slow: 23 | errorBudgetPercent: 10 24 | shortWindow: 6h 25 | longWindow: 3d 26 | -------------------------------------------------------------------------------- /internal/app/generate/noop.go: -------------------------------------------------------------------------------- 1 | package generate 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/prometheus/prometheus/model/rulefmt" 7 | "github.com/slok/sloth/internal/alert" 8 | "github.com/slok/sloth/internal/info" 9 | "github.com/slok/sloth/internal/prometheus" 10 | ) 11 | 12 | type noopSLIRecordingRulesGenerator bool 13 | 14 | const NoopSLIRecordingRulesGenerator = noopSLIRecordingRulesGenerator(false) 15 | 16 | func (noopSLIRecordingRulesGenerator) GenerateSLIRecordingRules(ctx context.Context, slo prometheus.SLO, alerts alert.MWMBAlertGroup) ([]rulefmt.Rule, error) { 17 | return nil, nil 18 | } 19 | 20 | type noopMetadataRecordingRulesGenerator bool 21 | 22 | const NoopMetadataRecordingRulesGenerator = noopMetadataRecordingRulesGenerator(false) 23 | 24 | func (noopMetadataRecordingRulesGenerator) GenerateMetadataRecordingRules(ctx context.Context, info info.Info, slo prometheus.SLO, alerts alert.MWMBAlertGroup) ([]rulefmt.Rule, error) { 25 | return nil, nil 26 | } 27 | 28 | type noopSLOAlertRulesGenerator bool 29 | 30 | const NoopSLOAlertRulesGenerator = noopSLOAlertRulesGenerator(false) 31 | 32 | func (noopSLOAlertRulesGenerator) GenerateSLOAlertRules(ctx context.Context, slo prometheus.SLO, alerts alert.MWMBAlertGroup) ([]rulefmt.Rule, error) { 33 | return nil, nil 34 | } 35 | -------------------------------------------------------------------------------- /internal/app/kubecontroller/retriever.go: -------------------------------------------------------------------------------- 1 | package kubecontroller 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/spotahome/kooper/v2/controller" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | "k8s.io/apimachinery/pkg/labels" 9 | "k8s.io/apimachinery/pkg/runtime" 10 | "k8s.io/apimachinery/pkg/watch" 11 | "k8s.io/client-go/tools/cache" 12 | 13 | slothv1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1" 14 | ) 15 | 16 | // RetrieverKubernetesRepository is the service to manage k8s resources by the Kubernetes controller retrievers. 
17 | type RetrieverKubernetesRepository interface { 18 | ListPrometheusServiceLevels(ctx context.Context, ns string, opts metav1.ListOptions) (*slothv1.PrometheusServiceLevelList, error) 19 | WatchPrometheusServiceLevels(ctx context.Context, ns string, opts metav1.ListOptions) (watch.Interface, error) 20 | } 21 | 22 | // NewPrometheusServiceLevelsRetriver returns the retriever for Prometheus service level events. 23 | func NewPrometheusServiceLevelsRetriver(ns string, labelSelector labels.Selector, repo RetrieverKubernetesRepository) controller.Retriever { 24 | return controller.MustRetrieverFromListerWatcher(&cache.ListWatch{ 25 | ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { 26 | options.LabelSelector = labelSelector.String() 27 | return repo.ListPrometheusServiceLevels(context.Background(), ns, options) 28 | }, 29 | WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { 30 | options.LabelSelector = labelSelector.String() 31 | return repo.WatchPrometheusServiceLevels(context.Background(), ns, options) 32 | }, 33 | }) 34 | } 35 | -------------------------------------------------------------------------------- /internal/info/info.go: -------------------------------------------------------------------------------- 1 | package info 2 | 3 | var ( 4 | // Version is the app version. 5 | Version = "dev" 6 | ) 7 | 8 | type Mode string 9 | 10 | const ( 11 | ModeTest = "test" 12 | ModeCLIGenPrometheus = "cli-gen-prom" 13 | ModeCLIGenKubernetes = "cli-gen-k8s" 14 | ModeCLIGenOpenSLO = "cli-gen-openslo" 15 | ModeControllerGenKubernetes = "ctrl-gen-k8s" 16 | ) 17 | 18 | // Info is the app and request information used by the SLO generators. 19 | type Info struct { 20 | Version string 21 | Mode Mode 22 | Spec string 23 | } 24 | -------------------------------------------------------------------------------- /internal/k8sprometheus/helpers.go: -------------------------------------------------------------------------------- 1 | package k8sprometheus 2 | 3 | func mergeLabels(ms ...map[string]string) map[string]string { 4 | res := map[string]string{} 5 | for _, m := range ms { 6 | for k, v := range m { 7 | res[k] = v 8 | } 9 | } 10 | 11 | return res 12 | } 13 | -------------------------------------------------------------------------------- /internal/k8sprometheus/k8sprometheusmock/prometheus_rules_ensurer.go: -------------------------------------------------------------------------------- 1 | // Code generated by mockery v2.46.3. DO NOT EDIT. 2 | 3 | package k8sprometheusmock 4 | 5 | import ( 6 | context "context" 7 | 8 | mock "github.com/stretchr/testify/mock" 9 | 10 | v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" 11 | ) 12 | 13 | // PrometheusRulesEnsurer is an autogenerated mock type for the PrometheusRulesEnsurer type 14 | type PrometheusRulesEnsurer struct { 15 | mock.Mock 16 | } 17 | 18 | // EnsurePrometheusRule provides a mock function with given fields: ctx, pr 19 | func (_m *PrometheusRulesEnsurer) EnsurePrometheusRule(ctx context.Context, pr *v1.PrometheusRule) error { 20 | ret := _m.Called(ctx, pr) 21 | 22 | if len(ret) == 0 { 23 | panic("no return value specified for EnsurePrometheusRule") 24 | } 25 | 26 | var r0 error 27 | if rf, ok := ret.Get(0).(func(context.Context, *v1.PrometheusRule) error); ok { 28 | r0 = rf(ctx, pr) 29 | } else { 30 | r0 = ret.Error(0) 31 | } 32 | 33 | return r0 34 | } 35 | 36 | // NewPrometheusRulesEnsurer creates a new instance of PrometheusRulesEnsurer.
It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. 37 | // The first argument is typically a *testing.T value. 38 | func NewPrometheusRulesEnsurer(t interface { 39 | mock.TestingT 40 | Cleanup(func()) 41 | }) *PrometheusRulesEnsurer { 42 | mock := &PrometheusRulesEnsurer{} 43 | mock.Mock.Test(t) 44 | 45 | t.Cleanup(func() { mock.AssertExpectations(t) }) 46 | 47 | return mock 48 | } 49 | -------------------------------------------------------------------------------- /internal/k8sprometheus/model.go: -------------------------------------------------------------------------------- 1 | package k8sprometheus 2 | 3 | import ( 4 | "github.com/go-playground/validator/v10" 5 | 6 | "github.com/slok/sloth/internal/prometheus" 7 | ) 8 | 9 | // K8sMeta is the simplified Kubernetes metadata. 10 | type K8sMeta struct { 11 | Kind string `validate:"required"` 12 | APIVersion string `validate:"required"` 13 | Name string `validate:"required"` 14 | UID string 15 | Namespace string 16 | Annotations map[string]string 17 | Labels map[string]string 18 | } 19 | 20 | // SLOGroup is a Kubernetes SLO group. It is created based on a regular Prometheus 21 | // SLO model and Kubernetes data. 22 | type SLOGroup struct { 23 | K8sMeta K8sMeta 24 | prometheus.SLOGroup 25 | } 26 | 27 | // Validate validates the SLO. 28 | func (s SLOGroup) Validate() error { 29 | err := modelSpecValidate.Struct(s.K8sMeta) 30 | if err != nil { 31 | return err 32 | } 33 | 34 | err = s.SLOGroup.Validate() 35 | if err != nil { 36 | return err 37 | } 38 | 39 | return nil 40 | } 41 | 42 | var modelSpecValidate = func() *validator.Validate { 43 | return validator.New() 44 | }() 45 | -------------------------------------------------------------------------------- /internal/k8sprometheus/model_test.go: -------------------------------------------------------------------------------- 1 | package k8sprometheus_test 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | 10 | "github.com/slok/sloth/internal/k8sprometheus" 11 | "github.com/slok/sloth/internal/prometheus" 12 | ) 13 | 14 | func getGoodSLOGroup() k8sprometheus.SLOGroup { 15 | return k8sprometheus.SLOGroup{ 16 | K8sMeta: k8sprometheus.K8sMeta{ 17 | Kind: "PrometheusServiceLevel", 18 | APIVersion: "sloth.slok.dev/v1", 19 | Name: "test", 20 | Namespace: "test-ns", 21 | }, 22 | SLOGroup: prometheus.SLOGroup{SLOs: []prometheus.SLO{ 23 | getGoodSLO("slo1"), 24 | getGoodSLO("slo2"), 25 | }, 26 | }, 27 | } 28 | } 29 | 30 | func getGoodSLO(name string) prometheus.SLO { 31 | return prometheus.SLO{ 32 | ID: fmt.Sprintf("%s-id", name), 33 | Name: name, 34 | Service: "test-svc", 35 | TimeWindow: 30 * 24 * time.Hour, 36 | SLI: prometheus.SLI{ 37 | Events: &prometheus.SLIEvents{ 38 | ErrorQuery: `sum(rate(grpc_server_handled_requests_count{job="myapp",code=~"Internal|Unavailable"}[{{ .window }}]))`, 39 | TotalQuery: `sum(rate(grpc_server_handled_requests_count{job="myapp"}[{{ .window }}]))`, 40 | }, 41 | }, 42 | Objective: 99.99, 43 | Labels: map[string]string{ 44 | "owner": "myteam", 45 | "category": "test", 46 | }, 47 | InfoLabels: map[string]string{ 48 | "foo": "bar", 49 | }, 50 | PageAlertMeta: prometheus.AlertMeta{ 51 | Disable: false, 52 | Name: "testAlert", 53 | Labels: map[string]string{ 54 | "tier": "1", 55 | "severity": "slack", 56 | "channel": "#a-myteam", 57 | }, 58 | Annotations: map[string]string{ 59 | "message": "This is very important.", 60 | "runbook": "http://whatever.com", 61 | }, 62 | },
63 | TicketAlertMeta: prometheus.AlertMeta{ 64 | Disable: false, 65 | Name: "testAlert", 66 | Labels: map[string]string{ 67 | "tier": "1", 68 | "severity": "slack", 69 | "channel": "#a-not-so-important", 70 | }, 71 | Annotations: map[string]string{ 72 | "message": "This is not very important.", 73 | "runbook": "http://whatever.com", 74 | }, 75 | }, 76 | } 77 | } 78 | 79 | func TestModelValidationSpec(t *testing.T) { 80 | tests := map[string]struct { 81 | slos func() k8sprometheus.SLOGroup 82 | expErrMessage string 83 | }{ 84 | "Correct SLOs should not fail.": { 85 | slos: getGoodSLOGroup, 86 | }, 87 | 88 | "Kind is required.": { 89 | slos: func() k8sprometheus.SLOGroup { 90 | sg := getGoodSLOGroup() 91 | sg.K8sMeta.Kind = "" 92 | return sg 93 | }, 94 | expErrMessage: "Key: 'K8sMeta.Kind' Error:Field validation for 'Kind' failed on the 'required' tag", 95 | }, 96 | 97 | "APIVersion is required.": { 98 | slos: func() k8sprometheus.SLOGroup { 99 | sg := getGoodSLOGroup() 100 | sg.K8sMeta.APIVersion = "" 101 | return sg 102 | }, 103 | expErrMessage: "Key: 'K8sMeta.APIVersion' Error:Field validation for 'APIVersion' failed on the 'required' tag", 104 | }, 105 | 106 | "Name is required.": { 107 | slos: func() k8sprometheus.SLOGroup { 108 | sg := getGoodSLOGroup() 109 | sg.K8sMeta.Name = "" 110 | return sg 111 | }, 112 | expErrMessage: "Key: 'K8sMeta.Name' Error:Field validation for 'Name' failed on the 'required' tag", 113 | }, 114 | 115 | "SLO validation is executed correctly and fails if SLOs fail.": { 116 | slos: func() k8sprometheus.SLOGroup { 117 | sg := getGoodSLOGroup() 118 | sg.SLOs[0].ID = "" 119 | return sg 120 | }, 121 | expErrMessage: "Key: 'SLOGroup.SLOs[0].ID' Error:Field validation for 'ID' failed on the 'required' tag", 122 | }, 123 | } 124 | 125 | for name, test := range tests { 126 | t.Run(name, func(t *testing.T) { 127 | assert := assert.New(t) 128 | 129 | slos := test.slos() 130 | err := slos.Validate() 131 | 132 | if test.expErrMessage != "" { 133 | assert.Error(err) 134 | assert.Equal(test.expErrMessage, err.Error()) 135 | } else { 136 | assert.NoError(err) 137 | } 138 | }) 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /internal/log/log.go: -------------------------------------------------------------------------------- 1 | package log 2 | 3 | import "context" 4 | 5 | // Kv is a helper type for structured logging fields usage. 6 | type Kv = map[string]interface{} 7 | 8 | // Logger is the interface that the loggers used by the library must implement. 9 | type Logger interface { 10 | Infof(format string, args ...interface{}) 11 | Warningf(format string, args ...interface{}) 12 | Errorf(format string, args ...interface{}) 13 | Debugf(format string, args ...interface{}) 14 | WithValues(values map[string]interface{}) Logger 15 | WithCtxValues(ctx context.Context) Logger 16 | SetValuesOnCtx(parent context.Context, values map[string]interface{}) context.Context 17 | } 18 | 19 | // Noop logger doesn't log anything.
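// It is handy as a safe default when the caller doesn't provide a logger.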
20 | const Noop = noop(0) 21 | 22 | type noop int 23 | 24 | func (n noop) Infof(format string, args ...interface{}) {} 25 | func (n noop) Warningf(format string, args ...interface{}) {} 26 | func (n noop) Errorf(format string, args ...interface{}) {} 27 | func (n noop) Debugf(format string, args ...interface{}) {} 28 | func (n noop) WithValues(map[string]interface{}) Logger { return n } 29 | func (n noop) WithCtxValues(context.Context) Logger { return n } 30 | func (n noop) SetValuesOnCtx(parent context.Context, values Kv) context.Context { return parent } 31 | 32 | type contextKey string 33 | 34 | // contextLogValuesKey is used as a unique key to store log values in the context. 35 | const contextLogValuesKey = contextKey("internal-log") 36 | 37 | // CtxWithValues returns a copy of parent in which the key values passed have been 38 | // stored, ready to be used with log.Logger. 39 | func CtxWithValues(parent context.Context, kv Kv) context.Context { 40 | // Maybe we have values already set. 41 | oldValues, ok := parent.Value(contextLogValuesKey).(Kv) 42 | if !ok { 43 | oldValues = Kv{} 44 | } 45 | 46 | // Copy old and received values into the new kv. 47 | newValues := Kv{} 48 | for k, v := range oldValues { 49 | newValues[k] = v 50 | } 51 | for k, v := range kv { 52 | newValues[k] = v 53 | } 54 | 55 | return context.WithValue(parent, contextLogValuesKey, newValues) 56 | } 57 | 58 | // ValuesFromCtx gets the log Key values from a context. 59 | func ValuesFromCtx(ctx context.Context) Kv { 60 | values, ok := ctx.Value(contextLogValuesKey).(Kv) 61 | if !ok { 62 | return Kv{} 63 | } 64 | 65 | return values 66 | } 67 | -------------------------------------------------------------------------------- /internal/log/logrus/logrus.go: -------------------------------------------------------------------------------- 1 | package logrus 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/sirupsen/logrus" 7 | 8 | "github.com/slok/sloth/internal/log" 9 | ) 10 | 11 | type logger struct { 12 | *logrus.Entry 13 | } 14 | 15 | // NewLogrus returns a new log.Logger for a logrus implementation. 16 | func NewLogrus(l *logrus.Entry) log.Logger { 17 | return logger{Entry: l} 18 | } 19 | 20 | func (l logger) WithValues(kv log.Kv) log.Logger { 21 | newLogger := l.Entry.WithFields(kv) 22 | return NewLogrus(newLogger) 23 | } 24 | 25 | func (l logger) WithCtxValues(ctx context.Context) log.Logger { 26 | return l.WithValues(log.ValuesFromCtx(ctx)) 27 | } 28 | 29 | func (l logger) SetValuesOnCtx(parent context.Context, values log.Kv) context.Context { 30 | return log.CtxWithValues(parent, values) 31 | } 32 | -------------------------------------------------------------------------------- /internal/prometheus/alert_rules.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "text/template" 8 | 9 | "github.com/prometheus/prometheus/model/rulefmt" 10 | 11 | "github.com/slok/sloth/internal/alert" 12 | ) 13 | 14 | // alertGenFunc knows how to generate an SLO alert rule for one severity from its quick and slow alerts. 15 | type alertGenFunc func(slo SLO, sloAlert AlertMeta, quick, slow alert.MWMBAlert) (*rulefmt.Rule, error) 16 | 17 | type sloAlertRulesGenerator struct { 18 | alertGenFunc alertGenFunc 19 | } 20 | 21 | // SLOAlertRulesGenerator knows how to generate the SLO prometheus alert rules 22 | // from an SLO.
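// It emits at most two rules per SLO: one page alert and one ticket alert, each
// skipped when its alert metadata is disabled.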
23 | var SLOAlertRulesGenerator = sloAlertRulesGenerator{alertGenFunc: defaultSLOAlertGenerator} 24 | 25 | func (s sloAlertRulesGenerator) GenerateSLOAlertRules(ctx context.Context, slo SLO, alerts alert.MWMBAlertGroup) ([]rulefmt.Rule, error) { 26 | rules := []rulefmt.Rule{} 27 | 28 | // Generate Page alerts. 29 | if !slo.PageAlertMeta.Disable { 30 | rule, err := s.alertGenFunc(slo, slo.PageAlertMeta, alerts.PageQuick, alerts.PageSlow) 31 | if err != nil { 32 | return nil, fmt.Errorf("could not create page alert: %w", err) 33 | } 34 | 35 | rules = append(rules, *rule) 36 | } 37 | 38 | // Generate Ticket alerts. 39 | if !slo.TicketAlertMeta.Disable { 40 | rule, err := s.alertGenFunc(slo, slo.TicketAlertMeta, alerts.TicketQuick, alerts.TicketSlow) 41 | if err != nil { 42 | return nil, fmt.Errorf("could not create ticket alert: %w", err) 43 | } 44 | 45 | rules = append(rules, *rule) 46 | } 47 | 48 | return rules, nil 49 | } 50 | 51 | func defaultSLOAlertGenerator(slo SLO, sloAlert AlertMeta, quick, slow alert.MWMBAlert) (*rulefmt.Rule, error) { 52 | // Generate the filter labels based on the SLO ids. 53 | metricFilter := labelsToPromFilter(slo.GetSLOIDPromLabels()) 54 | 55 | // Render the alert template. 56 | tplData := struct { 57 | MetricFilter string 58 | ErrorBudgetRatio float64 59 | QuickShortMetric string 60 | QuickShortBurnFactor float64 61 | QuickLongMetric string 62 | QuickLongBurnFactor float64 63 | SlowShortMetric string 64 | SlowShortBurnFactor float64 65 | SlowQuickMetric string 66 | SlowQuickBurnFactor float64 67 | WindowLabel string 68 | }{ 69 | MetricFilter: metricFilter, 70 | ErrorBudgetRatio: quick.ErrorBudget / 100, // Either quick or slow works because they are the same. 71 | QuickShortMetric: slo.GetSLIErrorMetric(quick.ShortWindow), 72 | QuickShortBurnFactor: quick.BurnRateFactor, 73 | QuickLongMetric: slo.GetSLIErrorMetric(quick.LongWindow), 74 | QuickLongBurnFactor: quick.BurnRateFactor, 75 | SlowShortMetric: slo.GetSLIErrorMetric(slow.ShortWindow), 76 | SlowShortBurnFactor: slow.BurnRateFactor, 77 | SlowQuickMetric: slo.GetSLIErrorMetric(slow.LongWindow), 78 | SlowQuickBurnFactor: slow.BurnRateFactor, 79 | WindowLabel: sloWindowLabelName, 80 | } 81 | var expr bytes.Buffer 82 | err := mwmbAlertTpl.Execute(&expr, tplData) 83 | if err != nil { 84 | return nil, fmt.Errorf("could not render alert expression: %w", err) 85 | } 86 | 87 | // Add specific annotations. 88 | severity := quick.Severity.String() // Either quick or slow works because they are the same. 89 | extraAnnotations := map[string]string{ 90 | "title": fmt.Sprintf("(%s) {{$labels.%s}} {{$labels.%s}} SLO error budget burn rate is too fast.", severity, sloServiceLabelName, sloNameLabelName), 91 | "summary": fmt.Sprintf("{{$labels.%s}} {{$labels.%s}} SLO error budget burn rate is over expected.", sloServiceLabelName, sloNameLabelName), 92 | } 93 | 94 | // Add specific labels. We don't add the labels from the recording rules because they will be 95 | // inherited by the alerts; this way we avoid warnings about overridden labels. 96 | extraLabels := map[string]string{ 97 | sloSeverityLabelName: severity, 98 | } 99 | 100 | return &rulefmt.Rule{ 101 | Alert: sloAlert.Name, 102 | Expr: expr.String(), 103 | Annotations: mergeLabels(extraAnnotations, sloAlert.Annotations), 104 | Labels: mergeLabels(extraLabels, sloAlert.Labels), 105 | }, nil 106 | } 107 | 108 | // Multiburn multiwindow alert template.
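// Each long window is ANDed with its short window: the alert fires only while the SLI error
// ratio exceeds burnRateFactor * errorBudgetRatio over BOTH windows (the short window makes the
// alert resolve quickly once the burn stops), and the quick and slow pairs are ORed together.
// E.g. for a 30d 99.9% SLO, the page-quick branch fires while the error ratio over both 5m and
// 1h is above 14.4 * 0.001 (2% of the budget consumed in 1h => 0.02 * 720h / 1h = 14.4).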
109 | var mwmbAlertTpl = template.Must(template.New("mwmbAlertTpl").Option("missingkey=error").Parse(`(
110 | max({{ .QuickShortMetric }}{{ .MetricFilter }} > ({{ .QuickShortBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
111 | and
112 | max({{ .QuickLongMetric }}{{ .MetricFilter }} > ({{ .QuickLongBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
113 | )
114 | or
115 | (
116 | max({{ .SlowShortMetric }}{{ .MetricFilter }} > ({{ .SlowShortBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
117 | and
118 | max({{ .SlowQuickMetric }}{{ .MetricFilter }} > ({{ .SlowQuickBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
119 | )
120 | `))
121 | 
--------------------------------------------------------------------------------
/internal/prometheus/conventions.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 | 
3 | const (
4 | 	// Metrics.
5 | 	sliErrorMetricFmt = "slo:sli_error:ratio_rate%s"
6 | 
7 | 	// Labels.
8 | 	sloNameLabelName      = "sloth_slo"
9 | 	sloIDLabelName        = "sloth_id"
10 | 	sloServiceLabelName   = "sloth_service"
11 | 	sloWindowLabelName    = "sloth_window"
12 | 	sloSeverityLabelName  = "sloth_severity"
13 | 	sloVersionLabelName   = "sloth_version"
14 | 	sloModeLabelName      = "sloth_mode"
15 | 	sloSpecLabelName      = "sloth_spec"
16 | 	sloObjectiveLabelName = "sloth_objective"
17 | )
18 | 
--------------------------------------------------------------------------------
/internal/prometheus/helpers.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 | 
3 | import (
4 | 	"sort"
5 | 	"time"
6 | 
7 | 	prommodel "github.com/prometheus/common/model"
8 | 
9 | 	"github.com/slok/sloth/internal/alert"
10 | )
11 | 
12 | func mergeLabels(ms ...map[string]string) map[string]string {
13 | 	res := map[string]string{}
14 | 	for _, m := range ms {
15 | 		for k, v := range m {
16 | 			res[k] = v
17 | 		}
18 | 	}
19 | 
20 | 	return res
21 | }
22 | 
23 | func labelsToPromFilter(labels map[string]string) string {
24 | 	metricFilters := prommodel.LabelSet{}
25 | 	for k, v := range labels {
26 | 		metricFilters[prommodel.LabelName(k)] = prommodel.LabelValue(v)
27 | 	}
28 | 
29 | 	return metricFilters.String()
30 | }
31 | 
32 | // timeDurationToPromStr converts a duration into the Prometheus duration string format (e.g. "5m", "1h").
33 | func timeDurationToPromStr(t time.Duration) string {
34 | 	return prommodel.Duration(t).String()
35 | }
36 | 
37 | // getAlertGroupWindows gets all the time windows from a multiwindow multiburn alert group.
38 | func getAlertGroupWindows(alerts alert.MWMBAlertGroup) []time.Duration {
39 | 	// Use a map to avoid duplicated windows.
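	// Keys are the windows' string representations, so equal durations collapse
	// into a single entry regardless of which alert they come from.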
40 | windows := map[string]time.Duration{ 41 | alerts.PageQuick.ShortWindow.String(): alerts.PageQuick.ShortWindow, 42 | alerts.PageQuick.LongWindow.String(): alerts.PageQuick.LongWindow, 43 | alerts.PageSlow.ShortWindow.String(): alerts.PageSlow.ShortWindow, 44 | alerts.PageSlow.LongWindow.String(): alerts.PageSlow.LongWindow, 45 | alerts.TicketQuick.ShortWindow.String(): alerts.TicketQuick.ShortWindow, 46 | alerts.TicketQuick.LongWindow.String(): alerts.TicketQuick.LongWindow, 47 | alerts.TicketSlow.ShortWindow.String(): alerts.TicketSlow.ShortWindow, 48 | alerts.TicketSlow.LongWindow.String(): alerts.TicketSlow.LongWindow, 49 | } 50 | 51 | res := make([]time.Duration, 0, len(windows)) 52 | for _, w := range windows { 53 | res = append(res, w) 54 | } 55 | sort.SliceStable(res, func(i, j int) bool { return res[i] < res[j] }) 56 | 57 | return res 58 | } 59 | -------------------------------------------------------------------------------- /internal/prometheus/prometheusmock/file_manager.go: -------------------------------------------------------------------------------- 1 | // Code generated by mockery v2.46.3. DO NOT EDIT. 2 | 3 | package prometheusmock 4 | 5 | import ( 6 | context "context" 7 | 8 | mock "github.com/stretchr/testify/mock" 9 | 10 | regexp "regexp" 11 | ) 12 | 13 | // FileManager is an autogenerated mock type for the FileManager type 14 | type FileManager struct { 15 | mock.Mock 16 | } 17 | 18 | // FindFiles provides a mock function with given fields: ctx, root, matcher 19 | func (_m *FileManager) FindFiles(ctx context.Context, root string, matcher *regexp.Regexp) ([]string, error) { 20 | ret := _m.Called(ctx, root, matcher) 21 | 22 | if len(ret) == 0 { 23 | panic("no return value specified for FindFiles") 24 | } 25 | 26 | var r0 []string 27 | var r1 error 28 | if rf, ok := ret.Get(0).(func(context.Context, string, *regexp.Regexp) ([]string, error)); ok { 29 | return rf(ctx, root, matcher) 30 | } 31 | if rf, ok := ret.Get(0).(func(context.Context, string, *regexp.Regexp) []string); ok { 32 | r0 = rf(ctx, root, matcher) 33 | } else { 34 | if ret.Get(0) != nil { 35 | r0 = ret.Get(0).([]string) 36 | } 37 | } 38 | 39 | if rf, ok := ret.Get(1).(func(context.Context, string, *regexp.Regexp) error); ok { 40 | r1 = rf(ctx, root, matcher) 41 | } else { 42 | r1 = ret.Error(1) 43 | } 44 | 45 | return r0, r1 46 | } 47 | 48 | // ReadFile provides a mock function with given fields: ctx, path 49 | func (_m *FileManager) ReadFile(ctx context.Context, path string) ([]byte, error) { 50 | ret := _m.Called(ctx, path) 51 | 52 | if len(ret) == 0 { 53 | panic("no return value specified for ReadFile") 54 | } 55 | 56 | var r0 []byte 57 | var r1 error 58 | if rf, ok := ret.Get(0).(func(context.Context, string) ([]byte, error)); ok { 59 | return rf(ctx, path) 60 | } 61 | if rf, ok := ret.Get(0).(func(context.Context, string) []byte); ok { 62 | r0 = rf(ctx, path) 63 | } else { 64 | if ret.Get(0) != nil { 65 | r0 = ret.Get(0).([]byte) 66 | } 67 | } 68 | 69 | if rf, ok := ret.Get(1).(func(context.Context, string) error); ok { 70 | r1 = rf(ctx, path) 71 | } else { 72 | r1 = ret.Error(1) 73 | } 74 | 75 | return r0, r1 76 | } 77 | 78 | // NewFileManager creates a new instance of FileManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. 79 | // The first argument is typically a *testing.T value. 
80 | func NewFileManager(t interface {
81 | 	mock.TestingT
82 | 	Cleanup(func())
83 | }) *FileManager {
84 | 	mock := &FileManager{}
85 | 	mock.Mock.Test(t)
86 | 
87 | 	t.Cleanup(func() { mock.AssertExpectations(t) })
88 | 
89 | 	return mock
90 | }
91 | 
--------------------------------------------------------------------------------
/internal/prometheus/sli_plugin_test.go:
--------------------------------------------------------------------------------
1 | package prometheus_test
2 | 
3 | import (
4 | 	"context"
5 | 	"testing"
6 | 
7 | 	"github.com/stretchr/testify/assert"
8 | 	"github.com/stretchr/testify/mock"
9 | 	"github.com/stretchr/testify/require"
10 | 
11 | 	"github.com/slok/sloth/internal/prometheus"
12 | 	"github.com/slok/sloth/internal/prometheus/prometheusmock"
13 | )
14 | 
15 | func TestSLIPluginLoader(t *testing.T) {
16 | 	tests := map[string]struct {
17 | 		pluginSrc   string
18 | 		pluginID    string
19 | 		meta        map[string]string
20 | 		labels      map[string]string
21 | 		options     map[string]string
22 | 		expPluginID string
23 | 		expSLIQuery string
24 | 		expErrLoad  bool
25 | 		expErr      bool
26 | 	}{
27 | 		"Plugin without plugin ID should fail on load.": {
28 | 			pluginSrc: `
29 | package testplugin
30 | 
31 | import "context"
32 | 
33 | const SLIPluginVersion = "prometheus/v1"
34 | 
35 | func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) {
36 | 	return "test_query{}", nil
37 | }
38 | `,
39 | 			expErrLoad: true,
40 | 		},
41 | 
42 | 		"Basic plugin should load and return a correct SLI.": {
43 | 			pluginSrc: `
44 | package testplugin
45 | 
46 | import "context"
47 | 
48 | const (
49 | 	SLIPluginID      = "test_plugin"
50 | 	SLIPluginVersion = "prometheus/v1"
51 | )
52 | 
53 | 
54 | func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) {
55 | 	return "test_query{}", nil
56 | }
57 | `,
58 | 			expPluginID: "test_plugin",
59 | 			expSLIQuery: "test_query{}",
60 | 		},
61 | 
62 | 		"Plugin with meta and options should load and return a correct SLI.": {
63 | 			pluginSrc: `
64 | package testplugin
65 | 
66 | import "context"
67 | 
68 | import "fmt"
69 | 
70 | const (
71 | 	SLIPluginID      = "test_plugin"
72 | 	SLIPluginVersion = "prometheus/v1"
73 | )
74 | 
75 | func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) {
76 | 	return fmt.Sprintf("test_query{mk1=\"%s\",lk1=\"%s\",k1=\"%s\",k2=\"%s\"}", meta["mk1"], labels["lk1"], options["k1"], options["k2"]), nil
77 | }
78 | `,
79 | 			meta:        map[string]string{"mk1": "mv1"},
80 | 			labels:      map[string]string{"lk1": "lv1"},
81 | 			options:     map[string]string{"k1": "v1", "k2": "v2"},
82 | 			expSLIQuery: `test_query{mk1="mv1",lk1="lv1",k1="v1",k2="v2"}`,
83 | 			expPluginID: "test_plugin",
84 | 		},
85 | 
86 | 		"Plugin with error should return errors.": {
87 | 			pluginSrc: `
88 | package testplugin
89 | 
90 | import "context"
91 | 
92 | import "fmt"
93 | 
94 | const (
95 | 	SLIPluginID      = "test_plugin"
96 | 	SLIPluginVersion = "prometheus/v1"
97 | )
98 | 
99 | func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) {
100 | 	return "", fmt.Errorf("something")
101 | }
102 | `,
103 | 			meta:        map[string]string{"mk1": "mv1"},
104 | 			labels:      map[string]string{"lk1": "lv1"},
105 | 			options:     map[string]string{"k1": "v1", "k2": "v2"},
106 | 			expPluginID: "test_plugin",
107 | 			expErr:      true,
108 | 		},
109 | 	}
110 | 
111 | 	for name, test := range tests {
112 | 		t.Run(name, func(t *testing.T) {
113 | 			assert := assert.New(t)
114 | 			require := require.New(t)
115 | 
116 | 			// Mock the plugin files.
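			// FindFiles returns a single fake plugin path, and ReadFile serves this
			// test case's plugin source for that path.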
117 | 			mfm := &prometheusmock.FileManager{}
118 | 			mfm.On("FindFiles", mock.Anything, "./", mock.Anything).Once().Return([]string{"testplugin/test.go"}, nil)
119 | 			mfm.On("ReadFile", mock.Anything, "testplugin/test.go").Once().Return([]byte(test.pluginSrc), nil)
120 | 
121 | 			// Create repository and load plugins.
122 | 			config := prometheus.FileSLIPluginRepoConfig{
123 | 				FileManager: mfm,
124 | 				Paths:       []string{"./"},
125 | 			}
126 | 			repo, err := prometheus.NewFileSLIPluginRepo(config)
127 | 			if test.expErrLoad {
128 | 				assert.Error(err)
129 | 				return
130 | 			}
131 | 			assert.NoError(err)
132 | 
133 | 			// Get plugin.
134 | 			plugin, err := repo.GetSLIPlugin(context.TODO(), test.expPluginID)
135 | 			require.NoError(err)
136 | 
137 | 			// Check.
138 | 			assert.Equal(test.expPluginID, plugin.ID)
139 | 
140 | 			gotSLIQuery, err := plugin.Func(context.TODO(), test.meta, test.labels, test.options)
141 | 			if test.expErr {
142 | 				assert.Error(err)
143 | 			} else if assert.NoError(err) {
144 | 				assert.Equal(test.expSLIQuery, gotSLIQuery)
145 | 			}
146 | 		})
147 | 	}
148 | }
149 | 
--------------------------------------------------------------------------------
/internal/prometheus/spec.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 | 
3 | import (
4 | 	"context"
5 | 	"fmt"
6 | 	"regexp"
7 | 	"time"
8 | 
9 | 	"gopkg.in/yaml.v2"
10 | 
11 | 	prometheusv1 "github.com/slok/sloth/pkg/prometheus/api/v1"
12 | 	prometheuspluginv1 "github.com/slok/sloth/pkg/prometheus/plugin/v1"
13 | )
14 | 
15 | type SLIPluginRepo interface {
16 | 	GetSLIPlugin(ctx context.Context, id string) (*SLIPlugin, error)
17 | }
18 | 
19 | // YAMLSpecLoader knows how to load YAML specs and convert them into a model.
20 | type YAMLSpecLoader struct {
21 | 	windowPeriod time.Duration
22 | 	pluginsRepo  SLIPluginRepo
23 | }
24 | 
25 | // NewYAMLSpecLoader returns a YAML spec loader.
26 | func NewYAMLSpecLoader(pluginsRepo SLIPluginRepo, windowPeriod time.Duration) YAMLSpecLoader {
27 | 	return YAMLSpecLoader{
28 | 		windowPeriod: windowPeriod,
29 | 		pluginsRepo:  pluginsRepo,
30 | 	}
31 | }
32 | 
33 | var specTypeV1Regex = regexp.MustCompile(`(?m)^version: +['"]?prometheus\/v1['"]? *$`)
34 | 
35 | func (y YAMLSpecLoader) IsSpecType(ctx context.Context, data []byte) bool {
36 | 	return specTypeV1Regex.Match(data)
37 | }
38 | 
39 | func (y YAMLSpecLoader) LoadSpec(ctx context.Context, data []byte) (*SLOGroup, error) {
40 | 	if len(data) == 0 {
41 | 		return nil, fmt.Errorf("spec is required")
42 | 	}
43 | 
44 | 	s := prometheusv1.Spec{}
45 | 	err := yaml.Unmarshal(data, &s)
46 | 	if err != nil {
47 | 		return nil, fmt.Errorf("could not unmarshal YAML spec correctly: %w", err)
48 | 	}
49 | 
50 | 	// Check version.
51 | 	if s.Version != prometheusv1.Version {
52 | 		return nil, fmt.Errorf("invalid spec version, should be %q", prometheusv1.Version)
53 | 	}
54 | 
55 | 	// Check that we have at least one SLO.
56 | if len(s.SLOs) == 0 { 57 | return nil, fmt.Errorf("at least one SLO is required") 58 | } 59 | 60 | m, err := y.mapSpecToModel(ctx, s) 61 | if err != nil { 62 | return nil, fmt.Errorf("could not map to model: %w", err) 63 | } 64 | 65 | return m, nil 66 | } 67 | 68 | func (y YAMLSpecLoader) mapSpecToModel(ctx context.Context, spec prometheusv1.Spec) (*SLOGroup, error) { 69 | models := make([]SLO, 0, len(spec.SLOs)) 70 | for _, specSLO := range spec.SLOs { 71 | 72 | slo := SLO{ 73 | ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name), 74 | RuleGroupInterval: specSLO.Interval.RuleGroupInterval, 75 | SLIErrorRulesInterval: specSLO.Interval.SLIErrorRulesInterval, 76 | MetadataRulesInterval: specSLO.Interval.MetadataRulesInterval, 77 | AlertRulesInterval: specSLO.Interval.AlertRulesInterval, 78 | Name: specSLO.Name, 79 | Description: specSLO.Description, 80 | Service: spec.Service, 81 | TimeWindow: y.windowPeriod, 82 | Objective: specSLO.Objective, 83 | Labels: mergeLabels(spec.Labels, specSLO.Labels), 84 | PageAlertMeta: AlertMeta{Disable: true}, 85 | TicketAlertMeta: AlertMeta{Disable: true}, 86 | InfoLabels: specSLO.InfoLabels, 87 | } 88 | 89 | // Set SLIs. 90 | if specSLO.SLI.Events != nil { 91 | slo.SLI.Events = &SLIEvents{ 92 | ErrorQuery: specSLO.SLI.Events.ErrorQuery, 93 | TotalQuery: specSLO.SLI.Events.TotalQuery, 94 | } 95 | } 96 | 97 | if specSLO.SLI.Raw != nil { 98 | slo.SLI.Raw = &SLIRaw{ 99 | ErrorRatioQuery: specSLO.SLI.Raw.ErrorRatioQuery, 100 | } 101 | } 102 | 103 | if specSLO.SLI.Plugin != nil { 104 | plugin, err := y.pluginsRepo.GetSLIPlugin(ctx, specSLO.SLI.Plugin.ID) 105 | if err != nil { 106 | return nil, fmt.Errorf("could not get plugin: %w", err) 107 | } 108 | 109 | meta := map[string]string{ 110 | prometheuspluginv1.SLIPluginMetaService: spec.Service, 111 | prometheuspluginv1.SLIPluginMetaSLO: specSLO.Name, 112 | prometheuspluginv1.SLIPluginMetaObjective: fmt.Sprintf("%f", specSLO.Objective), 113 | } 114 | 115 | rawQuery, err := plugin.Func(ctx, meta, spec.Labels, specSLO.SLI.Plugin.Options) 116 | if err != nil { 117 | return nil, fmt.Errorf("plugin %q execution error: %w", specSLO.SLI.Plugin.ID, err) 118 | } 119 | 120 | slo.SLI.Raw = &SLIRaw{ 121 | ErrorRatioQuery: rawQuery, 122 | } 123 | } 124 | 125 | // Set alerts. 
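		// Page and ticket alerts start disabled (see the SLO defaults above) and are
		// only populated with their metadata when the spec doesn't disable them.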
126 | if !specSLO.Alerting.PageAlert.Disable { 127 | slo.PageAlertMeta = AlertMeta{ 128 | Name: specSLO.Alerting.Name, 129 | Labels: mergeLabels(specSLO.Alerting.Labels, specSLO.Alerting.PageAlert.Labels), 130 | Annotations: mergeLabels(specSLO.Alerting.Annotations, specSLO.Alerting.PageAlert.Annotations), 131 | } 132 | } 133 | 134 | if !specSLO.Alerting.TicketAlert.Disable { 135 | slo.TicketAlertMeta = AlertMeta{ 136 | Name: specSLO.Alerting.Name, 137 | Labels: mergeLabels(specSLO.Alerting.Labels, specSLO.Alerting.TicketAlert.Labels), 138 | Annotations: mergeLabels(specSLO.Alerting.Annotations, specSLO.Alerting.TicketAlert.Annotations), 139 | } 140 | } 141 | 142 | models = append(models, slo) 143 | } 144 | 145 | return &SLOGroup{SLOs: models}, nil 146 | } 147 | -------------------------------------------------------------------------------- /pkg/kubernetes/api/sloth/register.go: -------------------------------------------------------------------------------- 1 | package sloth 2 | 3 | const ( 4 | GroupName = "sloth.slok.dev" 5 | ) 6 | -------------------------------------------------------------------------------- /pkg/kubernetes/api/sloth/v1/doc.go: -------------------------------------------------------------------------------- 1 | // +k8s:deepcopy-gen=package 2 | // +groupName=sloth.slok.dev 3 | // +versionName=v1 4 | 5 | package v1 6 | -------------------------------------------------------------------------------- /pkg/kubernetes/api/sloth/v1/register.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 5 | "k8s.io/apimachinery/pkg/runtime" 6 | "k8s.io/apimachinery/pkg/runtime/schema" 7 | 8 | "github.com/slok/sloth/pkg/kubernetes/api/sloth" 9 | ) 10 | 11 | const ( 12 | version = "v1" 13 | ) 14 | 15 | // SchemeGroupVersion is group version used to register these objects. 16 | var SchemeGroupVersion = schema.GroupVersion{Group: sloth.GroupName, Version: version} 17 | 18 | // Kind takes an unqualified kind and returns back a Group qualified GroupKind. 19 | func Kind(kind string) schema.GroupKind { 20 | return VersionKind(kind).GroupKind() 21 | } 22 | 23 | // VersionKind takes an unqualified kind and returns back a Group qualified GroupVersionKind. 24 | func VersionKind(kind string) schema.GroupVersionKind { 25 | return SchemeGroupVersion.WithKind(kind) 26 | } 27 | 28 | // Resource takes an unqualified resource and returns a Group qualified GroupResource. 29 | func Resource(resource string) schema.GroupResource { 30 | return SchemeGroupVersion.WithResource(resource).GroupResource() 31 | } 32 | 33 | var ( 34 | // SchemeBuilder initializes a scheme builder. 35 | SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) 36 | // AddToScheme is a global function that registers this API group & version to a scheme. 37 | AddToScheme = SchemeBuilder.AddToScheme 38 | ) 39 | 40 | // Adds the list of known types to Scheme. 41 | func addKnownTypes(scheme *runtime.Scheme) error { 42 | scheme.AddKnownTypes(SchemeGroupVersion, 43 | &PrometheusServiceLevel{}, 44 | &PrometheusServiceLevelList{}, 45 | ) 46 | metav1.AddToGroupVersion(scheme, SchemeGroupVersion) 47 | return nil 48 | } 49 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/clientset.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 
2 | 3 | package versioned 4 | 5 | import ( 6 | "fmt" 7 | "net/http" 8 | 9 | slothv1 "github.com/slok/sloth/pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1" 10 | discovery "k8s.io/client-go/discovery" 11 | rest "k8s.io/client-go/rest" 12 | flowcontrol "k8s.io/client-go/util/flowcontrol" 13 | ) 14 | 15 | type Interface interface { 16 | Discovery() discovery.DiscoveryInterface 17 | SlothV1() slothv1.SlothV1Interface 18 | } 19 | 20 | // Clientset contains the clients for groups. Each group has exactly one 21 | // version included in a Clientset. 22 | type Clientset struct { 23 | *discovery.DiscoveryClient 24 | slothV1 *slothv1.SlothV1Client 25 | } 26 | 27 | // SlothV1 retrieves the SlothV1Client 28 | func (c *Clientset) SlothV1() slothv1.SlothV1Interface { 29 | return c.slothV1 30 | } 31 | 32 | // Discovery retrieves the DiscoveryClient 33 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 34 | if c == nil { 35 | return nil 36 | } 37 | return c.DiscoveryClient 38 | } 39 | 40 | // NewForConfig creates a new Clientset for the given config. 41 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 42 | // NewForConfig will generate a rate-limiter in configShallowCopy. 43 | // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), 44 | // where httpClient was generated with rest.HTTPClientFor(c). 45 | func NewForConfig(c *rest.Config) (*Clientset, error) { 46 | configShallowCopy := *c 47 | 48 | if configShallowCopy.UserAgent == "" { 49 | configShallowCopy.UserAgent = rest.DefaultKubernetesUserAgent() 50 | } 51 | 52 | // share the transport between all clients 53 | httpClient, err := rest.HTTPClientFor(&configShallowCopy) 54 | if err != nil { 55 | return nil, err 56 | } 57 | 58 | return NewForConfigAndClient(&configShallowCopy, httpClient) 59 | } 60 | 61 | // NewForConfigAndClient creates a new Clientset for the given config and http client. 62 | // Note the http client provided takes precedence over the configured transport values. 63 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 64 | // NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 65 | func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) { 66 | configShallowCopy := *c 67 | if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { 68 | if configShallowCopy.Burst <= 0 { 69 | return nil, fmt.Errorf("burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0") 70 | } 71 | configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) 72 | } 73 | 74 | var cs Clientset 75 | var err error 76 | cs.slothV1, err = slothv1.NewForConfigAndClient(&configShallowCopy, httpClient) 77 | if err != nil { 78 | return nil, err 79 | } 80 | 81 | cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) 82 | if err != nil { 83 | return nil, err 84 | } 85 | return &cs, nil 86 | } 87 | 88 | // NewForConfigOrDie creates a new Clientset for the given config and 89 | // panics if there is an error in the config. 90 | func NewForConfigOrDie(c *rest.Config) *Clientset { 91 | cs, err := NewForConfig(c) 92 | if err != nil { 93 | panic(err) 94 | } 95 | return cs 96 | } 97 | 98 | // New creates a new Clientset for the given RESTClient. 
99 | func New(c rest.Interface) *Clientset { 100 | var cs Clientset 101 | cs.slothV1 = slothv1.New(c) 102 | 103 | cs.DiscoveryClient = discovery.NewDiscoveryClient(c) 104 | return &cs 105 | } 106 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // This package has the automatically generated clientset. 4 | package versioned 5 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/fake/clientset_generated.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package fake 4 | 5 | import ( 6 | clientset "github.com/slok/sloth/pkg/kubernetes/gen/clientset/versioned" 7 | slothv1 "github.com/slok/sloth/pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1" 8 | fakeslothv1 "github.com/slok/sloth/pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1/fake" 9 | "k8s.io/apimachinery/pkg/runtime" 10 | "k8s.io/apimachinery/pkg/watch" 11 | "k8s.io/client-go/discovery" 12 | fakediscovery "k8s.io/client-go/discovery/fake" 13 | "k8s.io/client-go/testing" 14 | ) 15 | 16 | // NewSimpleClientset returns a clientset that will respond with the provided objects. 17 | // It's backed by a very simple object tracker that processes creates, updates and deletions as-is, 18 | // without applying any validations and/or defaults. It shouldn't be considered a replacement 19 | // for a real clientset and is mostly useful in simple unit tests. 20 | func NewSimpleClientset(objects ...runtime.Object) *Clientset { 21 | o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) 22 | for _, obj := range objects { 23 | if err := o.Add(obj); err != nil { 24 | panic(err) 25 | } 26 | } 27 | 28 | cs := &Clientset{tracker: o} 29 | cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} 30 | cs.AddReactor("*", "*", testing.ObjectReaction(o)) 31 | cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { 32 | gvr := action.GetResource() 33 | ns := action.GetNamespace() 34 | watch, err := o.Watch(gvr, ns) 35 | if err != nil { 36 | return false, nil, err 37 | } 38 | return true, watch, nil 39 | }) 40 | 41 | return cs 42 | } 43 | 44 | // Clientset implements clientset.Interface. Meant to be embedded into a 45 | // struct to get a default implementation. This makes faking out just the method 46 | // you want to test easier. 47 | type Clientset struct { 48 | testing.Fake 49 | discovery *fakediscovery.FakeDiscovery 50 | tracker testing.ObjectTracker 51 | } 52 | 53 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 54 | return c.discovery 55 | } 56 | 57 | func (c *Clientset) Tracker() testing.ObjectTracker { 58 | return c.tracker 59 | } 60 | 61 | var ( 62 | _ clientset.Interface = &Clientset{} 63 | _ testing.FakeClient = &Clientset{} 64 | ) 65 | 66 | // SlothV1 retrieves the SlothV1Client 67 | func (c *Clientset) SlothV1() slothv1.SlothV1Interface { 68 | return &fakeslothv1.FakeSlothV1{Fake: &c.Fake} 69 | } 70 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 
2 | 3 | // This package has the automatically generated fake clientset. 4 | package fake 5 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/fake/register.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package fake 4 | 5 | import ( 6 | slothv1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | runtime "k8s.io/apimachinery/pkg/runtime" 9 | schema "k8s.io/apimachinery/pkg/runtime/schema" 10 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 11 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 12 | ) 13 | 14 | var scheme = runtime.NewScheme() 15 | var codecs = serializer.NewCodecFactory(scheme) 16 | 17 | var localSchemeBuilder = runtime.SchemeBuilder{ 18 | slothv1.AddToScheme, 19 | } 20 | 21 | // AddToScheme adds all types of this clientset into the given scheme. This allows composition 22 | // of clientsets, like in: 23 | // 24 | // import ( 25 | // "k8s.io/client-go/kubernetes" 26 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 27 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 28 | // ) 29 | // 30 | // kclientset, _ := kubernetes.NewForConfig(c) 31 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 32 | // 33 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 34 | // correctly. 35 | var AddToScheme = localSchemeBuilder.AddToScheme 36 | 37 | func init() { 38 | v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) 39 | utilruntime.Must(AddToScheme(scheme)) 40 | } 41 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/scheme/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // This package contains the scheme of the automatically generated clientset. 4 | package scheme 5 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/scheme/register.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package scheme 4 | 5 | import ( 6 | slothv1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | runtime "k8s.io/apimachinery/pkg/runtime" 9 | schema "k8s.io/apimachinery/pkg/runtime/schema" 10 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 11 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 12 | ) 13 | 14 | var Scheme = runtime.NewScheme() 15 | var Codecs = serializer.NewCodecFactory(Scheme) 16 | var ParameterCodec = runtime.NewParameterCodec(Scheme) 17 | var localSchemeBuilder = runtime.SchemeBuilder{ 18 | slothv1.AddToScheme, 19 | } 20 | 21 | // AddToScheme adds all types of this clientset into the given scheme. 
This allows composition 22 | // of clientsets, like in: 23 | // 24 | // import ( 25 | // "k8s.io/client-go/kubernetes" 26 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 27 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 28 | // ) 29 | // 30 | // kclientset, _ := kubernetes.NewForConfig(c) 31 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 32 | // 33 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 34 | // correctly. 35 | var AddToScheme = localSchemeBuilder.AddToScheme 36 | 37 | func init() { 38 | v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) 39 | utilruntime.Must(AddToScheme(Scheme)) 40 | } 41 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // This package has the automatically generated typed clients. 4 | package v1 5 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // Package fake has the automatically generated clients. 4 | package fake 5 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1/fake/fake_sloth_client.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package fake 4 | 5 | import ( 6 | v1 "github.com/slok/sloth/pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1" 7 | rest "k8s.io/client-go/rest" 8 | testing "k8s.io/client-go/testing" 9 | ) 10 | 11 | type FakeSlothV1 struct { 12 | *testing.Fake 13 | } 14 | 15 | func (c *FakeSlothV1) PrometheusServiceLevels(namespace string) v1.PrometheusServiceLevelInterface { 16 | return &FakePrometheusServiceLevels{c, namespace} 17 | } 18 | 19 | // RESTClient returns a RESTClient that is used to communicate 20 | // with API server by this client implementation. 21 | func (c *FakeSlothV1) RESTClient() rest.Interface { 22 | var ret *rest.RESTClient 23 | return ret 24 | } 25 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1/generated_expansion.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package v1 4 | 5 | type PrometheusServiceLevelExpansion interface{} 6 | -------------------------------------------------------------------------------- /pkg/kubernetes/gen/clientset/versioned/typed/sloth/v1/sloth_client.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 
2 | 3 | package v1 4 | 5 | import ( 6 | "net/http" 7 | 8 | v1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1" 9 | "github.com/slok/sloth/pkg/kubernetes/gen/clientset/versioned/scheme" 10 | rest "k8s.io/client-go/rest" 11 | ) 12 | 13 | type SlothV1Interface interface { 14 | RESTClient() rest.Interface 15 | PrometheusServiceLevelsGetter 16 | } 17 | 18 | // SlothV1Client is used to interact with features provided by the sloth.slok.dev group. 19 | type SlothV1Client struct { 20 | restClient rest.Interface 21 | } 22 | 23 | func (c *SlothV1Client) PrometheusServiceLevels(namespace string) PrometheusServiceLevelInterface { 24 | return newPrometheusServiceLevels(c, namespace) 25 | } 26 | 27 | // NewForConfig creates a new SlothV1Client for the given config. 28 | // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), 29 | // where httpClient was generated with rest.HTTPClientFor(c). 30 | func NewForConfig(c *rest.Config) (*SlothV1Client, error) { 31 | config := *c 32 | if err := setConfigDefaults(&config); err != nil { 33 | return nil, err 34 | } 35 | httpClient, err := rest.HTTPClientFor(&config) 36 | if err != nil { 37 | return nil, err 38 | } 39 | return NewForConfigAndClient(&config, httpClient) 40 | } 41 | 42 | // NewForConfigAndClient creates a new SlothV1Client for the given config and http client. 43 | // Note the http client provided takes precedence over the configured transport values. 44 | func NewForConfigAndClient(c *rest.Config, h *http.Client) (*SlothV1Client, error) { 45 | config := *c 46 | if err := setConfigDefaults(&config); err != nil { 47 | return nil, err 48 | } 49 | client, err := rest.RESTClientForConfigAndClient(&config, h) 50 | if err != nil { 51 | return nil, err 52 | } 53 | return &SlothV1Client{client}, nil 54 | } 55 | 56 | // NewForConfigOrDie creates a new SlothV1Client for the given config and 57 | // panics if there is an error in the config. 58 | func NewForConfigOrDie(c *rest.Config) *SlothV1Client { 59 | client, err := NewForConfig(c) 60 | if err != nil { 61 | panic(err) 62 | } 63 | return client 64 | } 65 | 66 | // New creates a new SlothV1Client for the given RESTClient. 67 | func New(c rest.Interface) *SlothV1Client { 68 | return &SlothV1Client{c} 69 | } 70 | 71 | func setConfigDefaults(config *rest.Config) error { 72 | gv := v1.SchemeGroupVersion 73 | config.GroupVersion = &gv 74 | config.APIPath = "/apis" 75 | config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() 76 | 77 | if config.UserAgent == "" { 78 | config.UserAgent = rest.DefaultKubernetesUserAgent() 79 | } 80 | 81 | return nil 82 | } 83 | 84 | // RESTClient returns a RESTClient that is used to communicate 85 | // with API server by this client implementation. 
86 | func (c *SlothV1Client) RESTClient() rest.Interface {
87 | 	if c == nil {
88 | 		return nil
89 | 	}
90 | 	return c.restClient
91 | }
92 | 
--------------------------------------------------------------------------------
/pkg/prometheus/alertwindows/v1/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # v1
4 | 
5 | ```go
6 | import "github.com/slok/sloth/pkg/prometheus/alertwindows/v1"
7 | ```
8 | 
9 | ## Index
10 | 
11 | - [Constants](<#constants>)
12 | - [type AlertWindows](<#type-alertwindows>)
13 | - [type PageWindow](<#type-pagewindow>)
14 | - [type QuickSlowWindow](<#type-quickslowwindow>)
15 | - [type Spec](<#type-spec>)
16 | - [type TicketWindow](<#type-ticketwindow>)
17 | - [type Window](<#type-window>)
18 | 
19 | 
20 | ## Constants
21 | 
22 | ```go
23 | const APIVersion = "sloth.slok.dev/v1"
24 | ```
25 | 
26 | ```go
27 | const Kind = "AlertWindows"
28 | ```
29 | 
30 | ## type [AlertWindows]()
31 | 
32 | ```go
33 | type AlertWindows struct {
34 |     Kind       string `yaml:"kind"`
35 |     APIVersion string `yaml:"apiVersion"`
36 |     Spec       Spec   `yaml:"spec"`
37 | }
38 | ```
39 | 
40 | ## type [PageWindow]()
41 | 
42 | PageWindow represents the configuration for page alerting.
43 | 
44 | ```go
45 | type PageWindow struct {
46 |     QuickSlowWindow `yaml:",inline"`
47 | }
48 | ```
49 | 
50 | ## type [QuickSlowWindow]()
51 | 
52 | ```go
53 | type QuickSlowWindow struct {
54 |     // Quick represents the windows for the quick alerting trigger.
55 |     Quick Window `yaml:"quick"`
56 |     // Slow represents the windows for the slow alerting trigger.
57 |     Slow Window `yaml:"slow"`
58 | }
59 | ```
60 | 
61 | ## type [Spec]()
62 | 
63 | Spec represents the root type of the alerting windows.
64 | 
65 | ```go
66 | type Spec struct {
67 |     // SLOPeriod is the full SLO period used for these windows.
68 |     SLOPeriod prometheusmodel.Duration `yaml:"sloPeriod"`
69 |     // Page represents the configuration for the page alerting windows.
70 |     Page PageWindow `yaml:"page"`
71 |     // Ticket represents the configuration for the ticket alerting windows.
72 |     Ticket TicketWindow `yaml:"ticket"`
73 | }
74 | ```
75 | 
76 | ## type [TicketWindow]()
77 | 
78 | TicketWindow represents the configuration for ticket alerting.
79 | 
80 | ```go
81 | type TicketWindow struct {
82 |     QuickSlowWindow `yaml:",inline"`
83 | }
84 | ```
85 | 
86 | ## type [Window]()
87 | 
88 | ```go
89 | type Window struct {
90 |     // ErrorBudgetPercent is the max error budget consumption allowed in the window.
91 |     ErrorBudgetPercent float64 `yaml:"errorBudgetPercent"`
92 |     // ShortWindow is the window used to stop alerting when a big chunk of error
93 |     // budget has been consumed but the errors have already stopped.
94 |     ShortWindow prometheusmodel.Duration `yaml:"shortWindow"`
95 |     // LongWindow is the window used to measure the error budget consumption over the whole window.
96 |     LongWindow prometheusmodel.Duration `yaml:"longWindow"`
97 | }
98 | ```
99 | 
100 | 
101 | 
102 | Generated by [gomarkdoc]()
103 | 
--------------------------------------------------------------------------------
/pkg/prometheus/alertwindows/v1/v1.go:
--------------------------------------------------------------------------------
1 | // Package v1 contains the API types to define custom SLO period alerting windows.
2 | 
3 | package v1
4 | 
5 | import prometheusmodel "github.com/prometheus/common/model"
6 | 
7 | const Kind = "AlertWindows"
8 | const APIVersion = "sloth.slok.dev/v1"
9 | 
10 | //go:generate gomarkdoc -o ./README.md ./
11 | 
12 | type AlertWindows struct {
13 | 	Kind       string `yaml:"kind"`
14 | 	APIVersion string `yaml:"apiVersion"`
15 | 	Spec       Spec   `yaml:"spec"`
16 | }
17 | 
18 | // Spec represents the root type of the alerting windows.
19 | type Spec struct {
20 | 	// SLOPeriod is the full SLO period used for these windows.
21 | 	SLOPeriod prometheusmodel.Duration `yaml:"sloPeriod"`
22 | 	// Page represents the configuration for the page alerting windows.
23 | 	Page PageWindow `yaml:"page"`
24 | 	// Ticket represents the configuration for the ticket alerting windows.
25 | 	Ticket TicketWindow `yaml:"ticket"`
26 | }
27 | 
28 | // PageWindow represents the configuration for page alerting.
29 | type PageWindow struct {
30 | 	QuickSlowWindow `yaml:",inline"`
31 | }
32 | 
33 | // TicketWindow represents the configuration for ticket alerting.
34 | type TicketWindow struct {
35 | 	QuickSlowWindow `yaml:",inline"`
36 | }
37 | 
38 | type QuickSlowWindow struct {
39 | 	// Quick represents the windows for the quick alerting trigger.
40 | 	Quick Window `yaml:"quick"`
41 | 	// Slow represents the windows for the slow alerting trigger.
42 | 	Slow Window `yaml:"slow"`
43 | }
44 | 
45 | type Window struct {
46 | 	// ErrorBudgetPercent is the max error budget consumption allowed in the window.
47 | 	ErrorBudgetPercent float64 `yaml:"errorBudgetPercent"`
48 | 	// ShortWindow is the window used to stop alerting when a big chunk of error
49 | 	// budget has been consumed but the errors have already stopped.
50 | 	ShortWindow prometheusmodel.Duration `yaml:"shortWindow"`
51 | 	// LongWindow is the window used to measure the error budget consumption over the whole window.
52 | 	LongWindow prometheusmodel.Duration `yaml:"longWindow"`
53 | }
54 | 
--------------------------------------------------------------------------------
/pkg/prometheus/plugin/v1/v1.go:
--------------------------------------------------------------------------------
1 | // Package plugin has all the API to load prometheus plugins using Yaegi.
2 | // It uses aliases and common types to ease dynamic plugin loading so plugins don't need
3 | // to import this package as a library (removing dependencies/external libs from plugins).
4 | //
5 | // We use map[string]string and let the plugin make the correct conversion of types because
6 | // dealing with interfaces on dynamic plugins can lead to bugs and unwanted behaviour, so we
7 | // play it safe.
8 | package plugin
9 | 
10 | import "context"
11 | 
12 | // Version is this plugin type version.
13 | const Version = "prometheus/v1"
14 | 
15 | // SLIPluginVersion is the version of the plugin (e.g: `prometheus/v1`).
16 | type SLIPluginVersion = string
17 | 
18 | // SLIPluginID is the ID of the plugin.
19 | type SLIPluginID = string
20 | 
21 | // Metadata keys.
22 | const (
23 | 	SLIPluginMetaService   = "service"
24 | 	SLIPluginMetaSLO       = "slo"
25 | 	SLIPluginMetaObjective = "objective"
26 | )
27 | 
28 | // SLIPlugin knows how to generate SLIs based on data options.
29 | // 30 | // This is the type the SLI plugins need to implement. 31 | type SLIPlugin = func(ctx context.Context, meta, labels, options map[string]string) (query string, err error) 32 | -------------------------------------------------------------------------------- /scripts/build/bin/build-all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | # Build all. 7 | ostypes=("Linux" "Darwin" "Windows" "ARM") 8 | for ostype in "${ostypes[@]}" 9 | do 10 | ostype="${ostype}" ./scripts/build/bin/build.sh 11 | done 12 | 13 | # Create checksums. 14 | checksums_dir="./bin" 15 | cd ${checksums_dir} && sha256sum * > ./checksums.txt 16 | -------------------------------------------------------------------------------- /scripts/build/bin/build-raw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | # Env vars that can be set. 7 | # - EXTENSION: The binary out extension. 8 | # - VERSION: Version for the binary. 9 | # - GOOS: OS compiling target 10 | # - GOARCH: Arch compiling target. 11 | # - GOARM: ARM version. 12 | 13 | version_path="github.com/slok/sloth/internal/info.Version" 14 | src=./cmd/sloth 15 | out=./bin/sloth 16 | 17 | # Prepare flags. 18 | final_out=${out}${EXTENSION:-} 19 | ldf_cmp="-s -w -extldflags '-static'" 20 | f_ver="-X ${version_path}=${VERSION:-dev}" 21 | 22 | # Build binary. 23 | echo "[*] Building binary at ${final_out} (GOOS=${GOOS:-}, GOARCH=${GOARCH:-}, GOARM=${GOARM:-}, VERSION=${VERSION:-}, EXTENSION=${EXTENSION:-})" 24 | CGO_ENABLED=0 go build -buildvcs=false -o ${final_out} --ldflags "${ldf_cmp} ${f_ver}" ${src} 25 | -------------------------------------------------------------------------------- /scripts/build/bin/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | build_script="./scripts/build/bin/build-raw.sh" 7 | ostype=${ostype:-"native"} 8 | 9 | echo "[+] Build OS type selected: ${ostype}" 10 | 11 | if [ $ostype == 'Linux' ]; then 12 | EXTENSION="-linux-amd64" GOOS="linux" GOARCH="amd64" ${build_script} 13 | elif [ $ostype == 'Darwin' ]; then 14 | EXTENSION="-darwin-amd64" GOOS="darwin" GOARCH="amd64" ${build_script} 15 | EXTENSION="-darwin-arm64" GOOS="darwin" GOARCH="arm64" ${build_script} 16 | elif [ $ostype == 'Windows' ]; then 17 | EXTENSION="-windows-amd64.exe" GOOS="windows" GOARCH="amd64" ${build_script} 18 | elif [ $ostype == 'ARM' ]; then 19 | EXTENSION="-linux-arm64" GOOS="linux" GOARCH="arm64" ${build_script} 20 | EXTENSION="-linux-arm-v7" GOOS="linux" GOARCH="arm" GOARM="7" ${build_script} 21 | else 22 | # Native. 23 | ${build_script} 24 | fi 25 | -------------------------------------------------------------------------------- /scripts/build/docker/build-image-dev.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -e 4 | 5 | 6 | [ -z "$VERSION" ] && echo "VERSION env var is required." && exit 1; 7 | [ -z "$IMAGE" ] && echo "IMAGE env var is required." && exit 1; 8 | [ -z "$DOCKER_FILE_PATH" ] && echo "DOCKER_FILE_PATH env var is required." && exit 1; 9 | 10 | # Build image. 11 | echo "Building dev image ${IMAGE}:${VERSION}..." 12 | docker build \ 13 | -t "${IMAGE}:${VERSION}" \ 14 | -f "${DOCKER_FILE_PATH}" . 
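
# Example invocation (illustrative values, not the repository's official ones):
#   VERSION=dev IMAGE=slok/sloth-dev DOCKER_FILE_PATH=./docker/dev/Dockerfile \
#     ./scripts/build/docker/build-image-dev.sh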
--------------------------------------------------------------------------------
/scripts/build/docker/build-image.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | 
3 | set -e
4 | 
5 | 
6 | [ -z "$VERSION" ] && echo "VERSION env var is required." && exit 1;
7 | [ -z "$IMAGE" ] && echo "IMAGE env var is required." && exit 1;
8 | [ -z "$DOCKER_FILE_PATH" ] && echo "DOCKER_FILE_PATH env var is required." && exit 1;
9 | 
10 | # By default use amd64 architecture.
11 | DEF_ARCH=amd64
12 | ARCH=${ARCH:-$DEF_ARCH}
13 | 
14 | IMAGE_TAG_ARCH="${IMAGE}:${VERSION}-${ARCH}"
15 | 
16 | # Build image.
17 | echo "Building image ${IMAGE_TAG_ARCH}..."
18 | docker build \
19 |     --build-arg VERSION="${VERSION}" \
20 |     --build-arg ARCH="${ARCH}" \
21 |     -t "${IMAGE_TAG_ARCH}" \
22 |     -f "${DOCKER_FILE_PATH}" .
--------------------------------------------------------------------------------
/scripts/build/docker/build-publish-image-all.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -o errexit
4 | set -o nounset
5 | 
6 | [ -z "$VERSION" ] && echo "VERSION env var is required." && exit 1;
7 | [ -z "$IMAGE" ] && echo "IMAGE env var is required." && exit 1;
8 | 
9 | # Build and publish images for all architectures.
10 | archs=("amd64" "arm64" "arm" "ppc64le" "s390x")
11 | for arch in "${archs[@]}"; do
12 |     ARCH="${arch}" ./scripts/build/docker/build-image.sh
13 |     ARCH="${arch}" ./scripts/build/docker/publish-image.sh
14 | done
15 | 
16 | IMAGE_TAG="${IMAGE}:${VERSION}"
17 | 
18 | # Create manifest to join all arch images under one virtual tag.
19 | MANIFEST="docker manifest create -a ${IMAGE_TAG}"
20 | for arch in "${archs[@]}"; do
21 |     MANIFEST="${MANIFEST} ${IMAGE_TAG}-${arch}"
22 | done
23 | eval "${MANIFEST}"
24 | 
25 | # Annotate each arch manifest to set which image is built for which CPU architecture.
26 | for arch in "${archs[@]}"; do
27 |     docker manifest annotate --arch "${arch}" "${IMAGE_TAG}" "${IMAGE_TAG}-${arch}"
28 | done
29 | 
30 | # Push virtual tag metadata.
31 | docker manifest push "${IMAGE_TAG}"
32 | 
33 | # Same as the regular virtual tag but for `:latest`.
34 | if [ ! -z "${TAG_IMAGE_LATEST:-}" ]; then
35 |     IMAGE_TAG_LATEST="${IMAGE}:latest"
36 | 
37 |     # Clean latest manifest in case there is one.
38 |     docker manifest rm ${IMAGE_TAG_LATEST} || true
39 | 
40 |     # Create manifest to join all arch images under one virtual tag.
41 |     MANIFEST_LATEST="docker manifest create -a ${IMAGE_TAG_LATEST}"
42 |     for arch in "${archs[@]}"; do
43 |         MANIFEST_LATEST="${MANIFEST_LATEST} ${IMAGE_TAG}-${arch}"
44 |     done
45 |     eval "${MANIFEST_LATEST}"
46 | 
47 |     # Annotate each arch manifest to set which image is built for which CPU architecture.
48 |     for arch in "${archs[@]}"; do
49 |         docker manifest annotate --arch "${arch}" "${IMAGE_TAG_LATEST}" "${IMAGE_TAG}-${arch}"
50 |     done
51 | 
52 |     # Push virtual tag metadata.
53 |     docker manifest push "${IMAGE_TAG_LATEST}"
54 | fi
--------------------------------------------------------------------------------
/scripts/build/docker/publish-image.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | 
3 | set -e
4 | 
5 | 
6 | [ -z "$VERSION" ] && echo "VERSION env var is required." && exit 1;
7 | [ -z "$IMAGE" ] && echo "IMAGE env var is required."
&& exit 1; 8 | 9 | DEF_ARCH=amd64 10 | ARCH=${ARCH:-$DEF_ARCH} 11 | 12 | IMAGE_TAG_ARCH="${IMAGE}:${VERSION}-${ARCH}" 13 | 14 | echo "Pushing image ${IMAGE_TAG_ARCH}..." 15 | docker push ${IMAGE_TAG_ARCH} 16 | -------------------------------------------------------------------------------- /scripts/check/check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | golangci-lint run -------------------------------------------------------------------------------- /scripts/check/helm-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | cd ./deploy/kubernetes/helm/sloth/tests 7 | go test -race -coverprofile=.test_coverage.txt $(go list ./... | grep -v /test/integration ) 8 | go tool cover -func=.test_coverage.txt | tail -n1 | awk '{print "Total test coverage: " $3}' -------------------------------------------------------------------------------- /scripts/check/integration-test-cli.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | go test -race -tags='integration' -v ./test/integration/prometheus/... -------------------------------------------------------------------------------- /scripts/check/integration-test-k8s.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | go test -race -tags='integration' -v ./test/integration/k8scontroller/... -------------------------------------------------------------------------------- /scripts/check/integration-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | go test -race -tags='integration' -v ./test/integration/... -------------------------------------------------------------------------------- /scripts/check/unit-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | go test -buildvcs=false -race -coverprofile=.test_coverage.txt $(go list ./... | grep -v /test/integration ) 7 | go tool cover -func=.test_coverage.txt | tail -n1 | awk '{print "Total test coverage: " $3}' 8 | -------------------------------------------------------------------------------- /scripts/deploygen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: ai:ts=8:sw=8:noet 3 | set -efCo pipefail 4 | export SHELLOPTS 5 | IFS=$'\t\n' 6 | 7 | command -v helm >/dev/null 2>&1 || { echo 'please install helm'; exit 1; } 8 | 9 | HELM_CHART_PATH="${HELM_CHART_PATH:-./deploy/kubernetes/helm/sloth}" 10 | [ -z "$HELM_CHART_PATH" ] && echo "HELM_CHART_PATH env is needed" && exit 1; 11 | 12 | GEN_PATH="${GEN_PATH:-./deploy/kubernetes/raw}" 13 | [ -z "$GEN_PATH" ] && echo "GEN_PATH env is needed" && exit 1; 14 | 15 | mkdir -p "${GEN_PATH}" 16 | 17 | echo "[*] Rendering chart without plugins..." 18 | rm "${GEN_PATH}/sloth.yaml" 19 | helm template sloth "${HELM_CHART_PATH}" \ 20 | --namespace "monitoring" \ 21 | --set "commonPlugins.enabled=false" > "${GEN_PATH}/sloth.yaml" 22 | 23 | echo "[*] Rendering chart with plugins..." 
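# noclobber (set -C) is enabled above, so the previous render must be removed before redirecting over it.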
24 | rm "${GEN_PATH}/sloth-with-common-plugins.yaml"
25 | helm template sloth "${HELM_CHART_PATH}" \
26 |     --namespace "monitoring" > "${GEN_PATH}/sloth-with-common-plugins.yaml"
--------------------------------------------------------------------------------
/scripts/deps.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | 
3 | set -o errexit
4 | set -o nounset
5 | 
6 | go mod tidy
--------------------------------------------------------------------------------
/scripts/examplesgen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # vim: ai:ts=8:sw=8:noet
3 | set -efCo pipefail
4 | export SHELLOPTS
5 | IFS=$'\t\n'
6 | 
7 | command -v go >/dev/null 2>&1 || {
8 |     echo 'please install go'
9 |     exit 1
10 | }
11 | 
12 | SLOS_PATH="${SLOS_PATH:-./examples}"
13 | [ -z "$SLOS_PATH" ] && echo "SLOS_PATH env is needed" && exit 1
14 | 
15 | GEN_PATH="${GEN_PATH:-./examples/_gen}"
16 | [ -z "$GEN_PATH" ] && echo "GEN_PATH env is needed" && exit 1
17 | 
18 | mkdir -p "${GEN_PATH}"
19 | 
20 | # Rebuilding sloth (via go run) on each generation run is good enough here, and it
21 | # ensures the examples are generated with the current development version.
22 | go run ./cmd/sloth/ generate -i "${SLOS_PATH}" -o "${GEN_PATH}" -p "${SLOS_PATH}" --extra-labels "cmd=examplesgen.sh" -e "_gen|windows"
23 | 
--------------------------------------------------------------------------------
/scripts/gogen.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | 
3 | set -o errexit
4 | set -o nounset
5 | 
6 | go generate ./...
--------------------------------------------------------------------------------
/scripts/kubegen.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | 
3 | set -o errexit
4 | set -o nounset
5 | 
6 | IMAGE_CLI_GEN=quay.io/slok/kube-code-generator:v1.25.0
7 | IMAGE_CRD_GEN=quay.io/slok/kube-code-generator:v1.25.0
8 | ROOT_DIRECTORY=$(dirname "$(readlink -f "$0")")/../
9 | PROJECT_PACKAGE="github.com/slok/sloth"
10 | GEN_DIRECTORY="pkg/kubernetes/gen"
11 | 
12 | echo "Cleaning gen directory"
13 | rm -rf ./${GEN_DIRECTORY}
14 | 
15 | echo "Generating Kubernetes CRD clients..."
16 | docker run -it --rm \
17 |     -v ${ROOT_DIRECTORY}:/go/src/${PROJECT_PACKAGE} \
18 |     -e PROJECT_PACKAGE=${PROJECT_PACKAGE} \
19 |     -e CLIENT_GENERATOR_OUT=${PROJECT_PACKAGE}/pkg/kubernetes/gen \
20 |     -e APIS_ROOT=${PROJECT_PACKAGE}/pkg/kubernetes/api \
21 |     -e GROUPS_VERSION="sloth:v1" \
22 |     -e GENERATION_TARGETS="deepcopy,client" \
23 |     ${IMAGE_CLI_GEN}
24 | 
25 | echo "Generating Kubernetes CRD manifests..."
26 | docker run -it --rm \
27 |     -v ${ROOT_DIRECTORY}:/src \
28 |     -e GO_PROJECT_ROOT=/src \
29 |     -e CRD_FLAG="crd:crdVersions=v1,allowDangerousTypes=true" \
30 |     -e CRD_TYPES_PATH=/src/pkg/kubernetes/api \
31 |     -e CRD_OUT_PATH=/src/pkg/kubernetes/gen/crd \
32 |     ${IMAGE_CRD_GEN} update-crd.sh
33 | 
34 | echo "Copying crd to helm chart..."
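# Replace the chart's bundled CRDs with the freshly generated ones.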
35 | rm ./deploy/kubernetes/helm/sloth/crds/* 36 | cp "${GEN_DIRECTORY}/crd"/* deploy/kubernetes/helm/sloth/crds/ 37 | -------------------------------------------------------------------------------- /test/integration/crd/prometheus-operator-crd.yaml: -------------------------------------------------------------------------------- 1 | # https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml 2 | 3 | --- 4 | apiVersion: apiextensions.k8s.io/v1 5 | kind: CustomResourceDefinition 6 | metadata: 7 | annotations: 8 | controller-gen.kubebuilder.io/version: v0.4.1 9 | creationTimestamp: null 10 | name: prometheusrules.monitoring.coreos.com 11 | spec: 12 | group: monitoring.coreos.com 13 | names: 14 | kind: PrometheusRule 15 | listKind: PrometheusRuleList 16 | plural: prometheusrules 17 | singular: prometheusrule 18 | scope: Namespaced 19 | versions: 20 | - name: v1 21 | schema: 22 | openAPIV3Schema: 23 | description: PrometheusRule defines recording and alerting rules for a Prometheus instance 24 | properties: 25 | apiVersion: 26 | description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' 27 | type: string 28 | kind: 29 | description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' 30 | type: string 31 | metadata: 32 | type: object 33 | spec: 34 | description: Specification of desired alerting rule definitions for Prometheus. 35 | properties: 36 | groups: 37 | description: Content of Prometheus rule file 38 | items: 39 | description: 'RuleGroup is a list of sequentially evaluated recording and alerting rules. Note: PartialResponseStrategy is only used by ThanosRuler and will be ignored by Prometheus instances. Valid values for this field are ''warn'' or ''abort''. More info: https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md#partial-response' 40 | properties: 41 | interval: 42 | type: string 43 | name: 44 | type: string 45 | partial_response_strategy: 46 | type: string 47 | rules: 48 | items: 49 | description: Rule describes an alerting or recording rule. 
50 | properties: 51 | alert: 52 | type: string 53 | annotations: 54 | additionalProperties: 55 | type: string 56 | type: object 57 | expr: 58 | anyOf: 59 | - type: integer 60 | - type: string 61 | x-kubernetes-int-or-string: true 62 | for: 63 | type: string 64 | labels: 65 | additionalProperties: 66 | type: string 67 | type: object 68 | record: 69 | type: string 70 | required: 71 | - expr 72 | type: object 73 | type: array 74 | required: 75 | - name 76 | - rules 77 | type: object 78 | type: array 79 | type: object 80 | required: 81 | - spec 82 | type: object 83 | served: true 84 | storage: true 85 | status: 86 | acceptedNames: 87 | kind: "" 88 | plural: "" 89 | conditions: [] 90 | storedVersions: [] 91 | -------------------------------------------------------------------------------- /test/integration/k8scontroller/plugin/plugin.go: -------------------------------------------------------------------------------- 1 | package availability 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "regexp" 8 | "strings" 9 | "text/template" 10 | ) 11 | 12 | const ( 13 | SLIPluginVersion = "prometheus/v1" 14 | SLIPluginID = "integration_test" 15 | ) 16 | 17 | var tpl = template.Must(template.New("").Parse(` 18 | sum(rate(integration_test{ {{.filter}}job="{{.job}}",code=~"(5..|429)" }[{{"{{.window}}"}}])) 19 | / 20 | sum(rate(integration_test{ {{.filter}}job="{{.job}}" }[{{"{{.window}}"}}]))`)) 21 | 22 | var filterRegex = regexp.MustCompile(`([^=]+="[^=,"]+",)+`) 23 | 24 | func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) { 25 | // Get job. 26 | job, ok := options["job"] 27 | if !ok { 28 | return "", fmt.Errorf("job options is required") 29 | } 30 | 31 | // Validate labels. 32 | err := validateLabels(labels, "owner", "tier") 33 | if err != nil { 34 | return "", fmt.Errorf("invalid labels: %w", err) 35 | } 36 | 37 | // Sanitize filter. 38 | filter := options["filter"] 39 | if filter != "" { 40 | filter = strings.Trim(filter, "{}") 41 | filter = strings.Trim(filter, ",") 42 | filter = filter + "," 43 | match := filterRegex.MatchString(filter) 44 | if !match { 45 | return "", fmt.Errorf("invalid prometheus filter: %s", filter) 46 | } 47 | } 48 | 49 | // Create query. 
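	// Note: the template escapes {{.window}} (via {{"{{.window}}"}}) so it reaches the
	// generated query literally, where Sloth later substitutes each SLI window.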
50 | 	var b bytes.Buffer
51 | 	data := map[string]string{
52 | 		"job":    job,
53 | 		"filter": filter,
54 | 	}
55 | 	err = tpl.Execute(&b, data)
56 | 	if err != nil {
57 | 		return "", fmt.Errorf("could not execute template: %w", err)
58 | 	}
59 | 
60 | 	return b.String(), nil
61 | }
62 | 
63 | func validateLabels(labels map[string]string, requiredKeys ...string) error {
64 | 	for _, k := range requiredKeys {
65 | 		v, ok := labels[k]
66 | 		if !ok || v == "" {
67 | 			return fmt.Errorf("%q label is required", k)
68 | 		}
69 | 	}
70 | 
71 | 	return nil
72 | }
73 | 
--------------------------------------------------------------------------------
/test/integration/k8scontroller/windows/7d.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: sloth.slok.dev/v1
2 | kind: AlertWindows
3 | spec:
4 |   sloPeriod: 7d
5 |   page:
6 |     quick:
7 |       errorBudgetPercent: 8
8 |       shortWindow: 5m
9 |       longWindow: 1h
10 |     slow:
11 |       errorBudgetPercent: 12.5
12 |       shortWindow: 30m
13 |       longWindow: 6h
14 |   ticket:
15 |     quick:
16 |       errorBudgetPercent: 20
17 |       shortWindow: 2h
18 |       longWindow: 24h
19 |     slow:
20 |       errorBudgetPercent: 42
21 |       shortWindow: 6h
22 |       longWindow: 72h
23 | 
--------------------------------------------------------------------------------
/test/integration/prometheus/generate_test.go:
--------------------------------------------------------------------------------
1 | package prometheus_test
2 | 
3 | import (
4 | 	"bytes"
5 | 	"context"
6 | 	"os"
7 | 	"testing"
8 | 	"text/template"
9 | 
10 | 	"github.com/stretchr/testify/assert"
11 | 	"github.com/stretchr/testify/require"
12 | 
13 | 	"github.com/slok/sloth/test/integration/prometheus"
14 | 	"github.com/slok/sloth/test/integration/testutils"
15 | )
16 | 
17 | type expectedOutLoader struct {
18 | 	version string
19 | }
20 | 
21 | func (e expectedOutLoader) mustLoadExp(path string) string {
22 | 	fileData, err := os.ReadFile(path)
23 | 	if err != nil {
24 | 		panic(err)
25 | 	}
26 | 
27 | 	tmpl := template.Must(template.New("").Parse(string(fileData)))
28 | 
29 | 	data := map[string]string{"version": e.version}
30 | 	var b bytes.Buffer
31 | 	err = tmpl.Execute(&b, data)
32 | 	if err != nil {
33 | 		panic(err)
34 | 	}
35 | 
36 | 	return b.String()
37 | }
38 | 
39 | func TestPrometheusGenerate(t *testing.T) {
40 | 	// Tests config.
41 | 	config := prometheus.NewConfig(t)
42 | 	version, err := testutils.SlothVersion(context.TODO(), config.Binary)
43 | 	require.NoError(t, err)
44 | 
45 | 	expectLoader := expectedOutLoader{version: version}
46 | 
47 | 	// Tests.
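	// The expected outputs are kept as Go templates (".yaml.tpl" files) rather
	// than plain YAML so that the "Code generated by Sloth (<version>)" header
	// can be rendered with the version of the binary under test before comparing.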
48 | tests := map[string]struct { 49 | genCmdArgs string 50 | expOut string 51 | expErr bool 52 | }{ 53 | "Generate should generate the correct rules for all the SLOs.": { 54 | genCmdArgs: "--input ./testdata/in-base.yaml", 55 | expOut: expectLoader.mustLoadExp("./testdata/out-base.yaml.tpl"), 56 | }, 57 | 58 | "Generate should generate the correct rules for all the SLOs (Kubernetes).": { 59 | genCmdArgs: "--input ./testdata/in-base-k8s.yaml", 60 | expOut: expectLoader.mustLoadExp("./testdata/out-base-k8s.yaml.tpl"), 61 | }, 62 | 63 | "Generate without alerts should generate the correct recording rules for all the SLOs.": { 64 | genCmdArgs: "--input ./testdata/in-base.yaml --disable-alerts", 65 | expOut: expectLoader.mustLoadExp("./testdata/out-base-no-alerts.yaml.tpl"), 66 | }, 67 | 68 | "Generate without recordings should generate the correct alert rules for all the SLOs.": { 69 | genCmdArgs: "--input ./testdata/in-base.yaml --disable-recordings", 70 | expOut: expectLoader.mustLoadExp("./testdata/out-base-no-recordings.yaml.tpl"), 71 | }, 72 | 73 | "Generate with extra labels should generate the correct rules for all the SLOs.": { 74 | genCmdArgs: "--input ./testdata/in-base.yaml -l exk1=exv1 -l exk2=exv2", 75 | expOut: expectLoader.mustLoadExp("./testdata/out-base-extra-labels.yaml.tpl"), 76 | }, 77 | 78 | "Generate with plugins should generate the correct rules for all the SLOs.": { 79 | genCmdArgs: "--input ./testdata/in-plugin.yaml", 80 | expOut: expectLoader.mustLoadExp("./testdata/out-plugin.yaml.tpl"), 81 | }, 82 | 83 | "Generate using multifile YAML in single file should generate the correct rules for all the SLOs.": { 84 | genCmdArgs: "--input ./testdata/in-multifile.yaml", 85 | expOut: expectLoader.mustLoadExp("./testdata/out-multifile.yaml.tpl"), 86 | }, 87 | 88 | "Generate using multifile YAML in single file should generate the correct rules for all the SLOs (Kubernetes).": { 89 | genCmdArgs: "--input ./testdata/in-multifile-k8s.yaml", 90 | expOut: expectLoader.mustLoadExp("./testdata/out-multifile-k8s.yaml.tpl"), 91 | }, 92 | 93 | "Generate using OpenSLO YAML should generate Prometheus rules.": { 94 | genCmdArgs: "--input ./testdata/in-openslo.yaml", 95 | expOut: expectLoader.mustLoadExp("./testdata/out-openslo.yaml.tpl"), 96 | }, 97 | 98 | "Generate using 28 day time window should generate Prometheus rules.": { 99 | genCmdArgs: "--default-slo-period 28d --input ./testdata/in-base.yaml", 100 | expOut: expectLoader.mustLoadExp("./testdata/out-base-28d.yaml.tpl"), 101 | }, 102 | 103 | "Generate using custom 7 day time window should generate Prometheus rules.": { 104 | genCmdArgs: "--default-slo-period 7d --input ./testdata/in-base.yaml --slo-period-windows-path ./windows", 105 | expOut: expectLoader.mustLoadExp("./testdata/out-base-custom-windows-7d.yaml.tpl"), 106 | }, 107 | 108 | "Generate using invalid version should fail.": { 109 | genCmdArgs: "--input ./testdata/in-invalid-version.yaml", 110 | expErr: true, 111 | }, 112 | } 113 | 114 | for name, test := range tests { 115 | t.Run(name, func(t *testing.T) { 116 | assert := assert.New(t) 117 | 118 | // Run with context to stop on test end. 
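			// Each case shells out to the real sloth binary and compares its
			// stdout byte-for-byte against the rendered expected-output fixture.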
119 | 			ctx, cancel := context.WithCancel(context.Background())
120 | 			defer cancel()
121 | 			out, _, err := prometheus.RunSlothGenerate(ctx, config, test.genCmdArgs)
122 | 
123 | 			if test.expErr {
124 | 				assert.Error(err)
125 | 			} else if assert.NoError(err) {
126 | 				assert.Equal(test.expOut, string(out))
127 | 			}
128 | 		})
129 | 	}
130 | }
131 | 
--------------------------------------------------------------------------------
/test/integration/prometheus/helpers.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 | 
3 | import (
4 | 	"context"
5 | 	"fmt"
6 | 	"os"
7 | 	"os/exec"
8 | 	"testing"
9 | 
10 | 	"github.com/slok/sloth/test/integration/testutils"
11 | )
12 | 
13 | type Config struct {
14 | 	Binary string
15 | }
16 | 
17 | func (c *Config) defaults() error {
18 | 	if c.Binary == "" {
19 | 		c.Binary = "sloth"
20 | 	}
21 | 
22 | 	_, err := exec.LookPath(c.Binary)
23 | 	if err != nil {
24 | 		return fmt.Errorf("could not find sloth binary %q: %w", c.Binary, err)
25 | 	}
26 | 
27 | 	return nil
28 | }
29 | 
30 | // NewConfig prepares the configuration for integration tests; if the
31 | // configuration is not ready, it skips the test.
32 | func NewConfig(t *testing.T) Config {
33 | 	const (
34 | 		envSlothBin = "SLOTH_INTEGRATION_BINARY"
35 | 	)
36 | 
37 | 	c := Config{
38 | 		Binary: os.Getenv(envSlothBin),
39 | 	}
40 | 
41 | 	err := c.defaults()
42 | 	if err != nil {
43 | 		t.Skipf("Skipping due to invalid config: %s", err)
44 | 	}
45 | 
46 | 	return c
47 | }
48 | 
49 | func RunSlothGenerate(ctx context.Context, config Config, cmdArgs string) (stdout, stderr []byte, err error) {
50 | 	env := []string{
51 | 		fmt.Sprintf("SLOTH_SLI_PLUGINS_PATH=%s", "./"),
52 | 	}
53 | 
54 | 	return testutils.RunSloth(ctx, env, config.Binary, fmt.Sprintf("generate %s", cmdArgs), true)
55 | }
56 | 
57 | func RunSlothValidate(ctx context.Context, config Config, cmdArgs string) (stdout, stderr []byte, err error) {
58 | 	env := []string{
59 | 		fmt.Sprintf("SLOTH_SLI_PLUGINS_PATH=%s", "./"),
60 | 	}
61 | 
62 | 	return testutils.RunSloth(ctx, env, config.Binary, fmt.Sprintf("validate %s", cmdArgs), true)
63 | }
64 | 
--------------------------------------------------------------------------------
/test/integration/prometheus/plugin/plugin.go:
--------------------------------------------------------------------------------
1 | package availability
2 | 
3 | import (
4 | 	"bytes"
5 | 	"context"
6 | 	"fmt"
7 | 	"regexp"
8 | 	"strings"
9 | 	"text/template"
10 | )
11 | 
12 | const (
13 | 	SLIPluginVersion = "prometheus/v1"
14 | 	SLIPluginID      = "integration_test"
15 | )
16 | 
17 | var tpl = template.Must(template.New("").Parse(`
18 | sum(rate(integration_test{ {{.filter}}job="{{.job}}",code=~"(5..|429)" }[{{"{{.window}}"}}]))
19 | /
20 | sum(rate(integration_test{ {{.filter}}job="{{.job}}" }[{{"{{.window}}"}}]))`))
21 | 
22 | var filterRegex = regexp.MustCompile(`([^=]+="[^=,"]+",)+`)
23 | 
24 | func SLIPlugin(ctx context.Context, meta, labels, options map[string]string) (string, error) {
25 | 	// Get job.
26 | 	job, ok := options["job"]
27 | 	if !ok {
28 | 		return "", fmt.Errorf("job option is required")
29 | 	}
30 | 
31 | 	// Validate labels.
32 | 	err := validateLabels(labels, "owner", "tier")
33 | 	if err != nil {
34 | 		return "", fmt.Errorf("invalid labels: %w", err)
35 | 	}
36 | 
37 | 	// Sanitize filter.
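	// The optional filter is normalized so it can be prepended verbatim to the
	// label matchers in the query template: surrounding braces and commas are
	// stripped, a single trailing comma is re-added, and the result must match
	// the `label="value",` shape enforced by filterRegex.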
38 | 	filter := options["filter"]
39 | 	if filter != "" {
40 | 		filter = strings.Trim(filter, "{}")
41 | 		filter = strings.Trim(filter, ",")
42 | 		filter = filter + ","
43 | 		match := filterRegex.MatchString(filter)
44 | 		if !match {
45 | 			return "", fmt.Errorf("invalid prometheus filter: %s", filter)
46 | 		}
47 | 	}
48 | 
49 | 	// Create query.
50 | 	var b bytes.Buffer
51 | 	data := map[string]string{
52 | 		"job":    job,
53 | 		"filter": filter,
54 | 	}
55 | 	err = tpl.Execute(&b, data)
56 | 	if err != nil {
57 | 		return "", fmt.Errorf("could not execute template: %w", err)
58 | 	}
59 | 
60 | 	return b.String(), nil
61 | }
62 | 
63 | func validateLabels(labels map[string]string, requiredKeys ...string) error {
64 | 	for _, k := range requiredKeys {
65 | 		v, ok := labels[k]
66 | 		if !ok || v == "" {
67 | 			return fmt.Errorf("%q label is required", k)
68 | 		}
69 | 	}
70 | 
71 | 	return nil
72 | }
73 | 
--------------------------------------------------------------------------------
/test/integration/prometheus/testdata/in-base-k8s.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: sloth.slok.dev/v1
2 | kind: PrometheusServiceLevel
3 | metadata:
4 |   name: svc
5 |   namespace: test-ns
6 | spec:
7 |   service: "svc01"
8 |   labels:
9 |     global01k1: global01v1
10 |   slos:
11 |     - name: "slo1"
12 |       objective: 99.9
13 |       description: "This is SLO 01."
14 |       labels:
15 |         global02k1: global02v1
16 |       sli:
17 |         events:
18 |           errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
19 |           totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
20 |       alerting:
21 |         name: myServiceAlert
22 |         labels:
23 |           alert01k1: "alert01v1"
24 |         annotations:
25 |           alert02k1: "alert02k2"
26 |         pageAlert:
27 |           labels:
28 |             alert03k1: "alert03v1"
29 |         ticketAlert:
30 |           labels:
31 |             alert04k1: "alert04v1"
32 |     - name: "slo02"
33 |       objective: 95
34 |       description: "This is SLO 02."
35 |       labels:
36 |         global03k1: global03v1
37 |       sli:
38 |         raw:
39 |           errorRatioQuery: |
40 |             sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
41 |             /
42 |             sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
43 |       alerting:
44 |         pageAlert:
45 |           disable: true
46 |         ticketAlert:
47 |           disable: true
48 | 
--------------------------------------------------------------------------------
/test/integration/prometheus/testdata/in-base.yaml:
--------------------------------------------------------------------------------
1 | version: "prometheus/v1"
2 | service: "svc01"
3 | labels:
4 |   global01k1: global01v1
5 | slos:
6 |   - name: "slo1"
7 |     objective: 99.9
8 |     description: "This is SLO 01."
9 |     labels:
10 |       global02k1: global02v1
11 |     sli:
12 |       events:
13 |         error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
14 |         total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
15 |     alerting:
16 |       name: myServiceAlert
17 |       labels:
18 |         alert01k1: "alert01v1"
19 |       annotations:
20 |         alert02k1: "alert02k2"
21 |       page_alert:
22 |         labels:
23 |           alert03k1: "alert03v1"
24 |       ticket_alert:
25 |         labels:
26 |           alert04k1: "alert04v1"
27 |   - name: "slo02"
28 |     objective: 95
29 |     description: "This is SLO 02."
30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/in-invalid-version.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v999" 2 | service: "svc01" 3 | labels: 4 | global01k1: global01v1 5 | slos: 6 | - name: "slo1" 7 | objective: 99.9 8 | description: "This is SLO 01." 9 | labels: 10 | global02k1: global02v1 11 | sli: 12 | events: 13 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 14 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 15 | alerting: 16 | name: myServiceAlert 17 | labels: 18 | alert01k1: "alert01v1" 19 | annotations: 20 | alert02k1: "alert02k2" 21 | page_alert: 22 | labels: 23 | alert03k1: "alert03v1" 24 | ticket_alert: 25 | labels: 26 | alert04k1: "alert04v1" 27 | - name: "slo02" 28 | objective: 95 29 | description: "This is SLO 02." 30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/in-multifile-k8s.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: sloth.slok.dev/v1 3 | kind: PrometheusServiceLevel 4 | metadata: 5 | name: svc 6 | namespace: test-ns 7 | spec: 8 | service: "svc01" 9 | labels: 10 | global01k1: global01v1 11 | slos: 12 | - name: "slo1" 13 | objective: 99.9 14 | description: "This is SLO 01." 15 | labels: 16 | global02k1: global02v1 17 | sli: 18 | events: 19 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 20 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 21 | alerting: 22 | name: myServiceAlert 23 | labels: 24 | alert01k1: "alert01v1" 25 | annotations: 26 | alert02k1: "alert02k2" 27 | pageAlert: 28 | labels: 29 | alert03k1: "alert03v1" 30 | ticketAlert: 31 | labels: 32 | alert04k1: "alert04v1" 33 | - name: "slo02" 34 | objective: 95 35 | description: "This is SLO 02." 36 | labels: 37 | global03k1: global03v1 38 | sli: 39 | raw: 40 | errorRatioQuery: | 41 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 42 | / 43 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 44 | alerting: 45 | pageAlert: 46 | disable: true 47 | ticketAlert: 48 | disable: true 49 | 50 | --- 51 | apiVersion: sloth.slok.dev/v1 52 | kind: PrometheusServiceLevel 53 | metadata: 54 | name: svc-2 55 | namespace: test-ns-2 56 | spec: 57 | service: "svc02" 58 | labels: 59 | global01k1: global01v1 60 | slos: 61 | - name: "slo1" 62 | objective: 99.99 63 | description: "This is SLO 01." 
64 | labels: 65 | global02k1: global02v1 66 | sli: 67 | events: 68 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 69 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 70 | alerting: 71 | name: myServiceAlert 72 | labels: 73 | alert01k1: "alert01v1" 74 | annotations: 75 | alert02k1: "alert02k2" 76 | pageAlert: 77 | labels: 78 | alert03k1: "alert03v1" 79 | ticketAlert: 80 | labels: 81 | alert04k1: "alert04v1" 82 | - name: "slo02" 83 | objective: 95 84 | description: "This is SLO 02." 85 | labels: 86 | global03k1: global03v1 87 | sli: 88 | raw: 89 | errorRatioQuery: | 90 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 91 | / 92 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 93 | alerting: 94 | pageAlert: 95 | disable: true 96 | ticketAlert: 97 | disable: true 98 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/in-multifile.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: "prometheus/v1" 3 | service: "svc01" 4 | labels: 5 | global01k1: global01v1 6 | slos: 7 | - name: "slo1" 8 | objective: 99.9 9 | description: "This is SLO 01." 10 | labels: 11 | global02k1: global02v1 12 | sli: 13 | events: 14 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 15 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 16 | alerting: 17 | name: myServiceAlert 18 | labels: 19 | alert01k1: "alert01v1" 20 | annotations: 21 | alert02k1: "alert02k2" 22 | page_alert: 23 | labels: 24 | alert03k1: "alert03v1" 25 | ticket_alert: 26 | labels: 27 | alert04k1: "alert04v1" 28 | - name: "slo02" 29 | objective: 95 30 | description: "This is SLO 02." 31 | labels: 32 | global03k1: global03v1 33 | sli: 34 | raw: 35 | error_ratio_query: | 36 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 37 | / 38 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 39 | alerting: 40 | page_alert: 41 | disable: true 42 | ticket_alert: 43 | disable: true 44 | 45 | --- 46 | version: "prometheus/v1" 47 | service: "svc02" 48 | labels: 49 | global01k1: global01v1 50 | slos: 51 | - name: "slo1" 52 | objective: 99.99 53 | description: "This is SLO 01." 54 | labels: 55 | global02k1: global02v1 56 | sli: 57 | events: 58 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 59 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 60 | alerting: 61 | name: myServiceAlert 62 | labels: 63 | alert01k1: "alert01v1" 64 | annotations: 65 | alert02k1: "alert02k2" 66 | page_alert: 67 | labels: 68 | alert03k1: "alert03v1" 69 | ticket_alert: 70 | labels: 71 | alert04k1: "alert04v1" 72 | - name: "slo02" 73 | objective: 95 74 | description: "This is SLO 02." 
75 | labels: 76 | global03k1: global03v1 77 | sli: 78 | raw: 79 | error_ratio_query: | 80 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 81 | / 82 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 83 | alerting: 84 | page_alert: 85 | disable: true 86 | ticket_alert: 87 | disable: true 88 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/in-openslo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: openslo/v1alpha 2 | kind: SLO 3 | metadata: 4 | name: slo1 5 | displayName: Integration test SLO1 6 | spec: 7 | service: svc01 8 | description: "this is SLO1." 9 | budgetingMethod: Occurrences 10 | objectives: 11 | - ratioMetrics: 12 | good: 13 | source: prometheus 14 | queryType: promql 15 | query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}])) 16 | total: 17 | source: prometheus 18 | queryType: promql 19 | query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 20 | target: 0.999 21 | timeWindows: 22 | - count: 30 23 | unit: Day 24 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/in-plugin.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "svc01" 3 | labels: 4 | owner: myteam 5 | tier: "2" 6 | slos: 7 | - name: "slo1" 8 | objective: 99.9 9 | description: "This is SLO 01." 10 | sli: 11 | plugin: 12 | id: integration_test 13 | options: 14 | job: svc01 15 | filter: guybrush="threepwood",melee="island" 16 | alerting: 17 | page_alert: 18 | disable: true 19 | ticket_alert: 20 | disable: true 21 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/out-base-no-recordings.yaml.tpl: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | # Code generated by Sloth ({{ .version }}): https://github.com/slok/sloth. 4 | # DO NOT EDIT. 5 | 6 | groups: 7 | - name: sloth-slo-alerts-svc01-slo1 8 | rules: 9 | - alert: myServiceAlert 10 | expr: | 11 | ( 12 | max(slo:sli_error:ratio_rate5m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (14.4 * 0.0009999999999999432)) without (sloth_window) 13 | and 14 | max(slo:sli_error:ratio_rate1h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (14.4 * 0.0009999999999999432)) without (sloth_window) 15 | ) 16 | or 17 | ( 18 | max(slo:sli_error:ratio_rate30m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (6 * 0.0009999999999999432)) without (sloth_window) 19 | and 20 | max(slo:sli_error:ratio_rate6h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (6 * 0.0009999999999999432)) without (sloth_window) 21 | ) 22 | labels: 23 | alert01k1: alert01v1 24 | alert03k1: alert03v1 25 | sloth_severity: page 26 | annotations: 27 | alert02k1: alert02k2 28 | summary: '{{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error budget burn 29 | rate is over expected.' 30 | title: (page) {{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error budget 31 | burn rate is too fast. 
32 | - alert: myServiceAlert 33 | expr: | 34 | ( 35 | max(slo:sli_error:ratio_rate2h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (3 * 0.0009999999999999432)) without (sloth_window) 36 | and 37 | max(slo:sli_error:ratio_rate1d{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (3 * 0.0009999999999999432)) without (sloth_window) 38 | ) 39 | or 40 | ( 41 | max(slo:sli_error:ratio_rate6h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (1 * 0.0009999999999999432)) without (sloth_window) 42 | and 43 | max(slo:sli_error:ratio_rate3d{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (1 * 0.0009999999999999432)) without (sloth_window) 44 | ) 45 | labels: 46 | alert01k1: alert01v1 47 | alert04k1: alert04v1 48 | sloth_severity: ticket 49 | annotations: 50 | alert02k1: alert02k2 51 | summary: '{{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error budget burn 52 | rate is over expected.' 53 | title: (ticket) {{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error budget 54 | burn rate is too fast. 55 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/bad/bad-aa.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "svc01" 3 | labels: 4 | global01k1: global01v1 5 | slos: 6 | - name: "slo1" 7 | objective: 99.9 8 | description: "This is SLO 01." 9 | labels: 10 | global02k1: global02v1 11 | sli: 12 | events: 13 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 14 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 15 | alerting: 16 | name: myServiceAlert 17 | labels: 18 | alert01k1: "alert01v1" 19 | annotations: 20 | alert02k1: "alert02k2" 21 | pageAlert: 22 | labels: 23 | alert03k1: "alert03v1" 24 | ticketAlert: 25 | labels: 26 | alert04k1: "alert04v1" 27 | - name: "slo02" 28 | objective: 101 # BAD! 29 | description: "This is SLO 02." 30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/bad/bad-ab.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "svc01" 3 | labels: 4 | global01k1: global01v1 5 | slos: 6 | - name: "slo1" 7 | objective: 99.9 8 | description: "This is SLO 01." 9 | labels: 10 | global02k1: global02v1 11 | sli: 12 | events: 13 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 14 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 15 | alerting: 16 | name: myServiceAlert 17 | labels: 18 | alert01k1: "alert01v1" 19 | annotations: 20 | alert02k1: "alert02k2" 21 | pageAlert: 22 | labels: 23 | alert03k1: "alert03v1" 24 | ticketAlert: 25 | labels: 26 | alert04k1: "alert04v1" 27 | - name: "slo02" 28 | objective: 101 # BAD! 29 | description: "This is SLO 02." 
30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/bad/bad-ba.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "svc01" 3 | labels: 4 | global01k1: global01v1 5 | slos: 6 | - name: "slo1" 7 | objective: 99.9 8 | description: "This is SLO 01." 9 | labels: 10 | global02k1: global02v1 11 | sli: 12 | events: 13 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 14 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 15 | alerting: 16 | name: myServiceAlert 17 | labels: 18 | alert01k1: "alert01v1" 19 | annotations: 20 | alert02k1: "alert02k2" 21 | pageAlert: 22 | labels: 23 | alert03k1: "alert03v1" 24 | ticketAlert: 25 | labels: 26 | alert04k1: "alert04v1" 27 | - name: "slo02" 28 | objective: 101 # BAD! 29 | description: "This is SLO 02." 30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/bad/bad-k8s.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: sloth.slok.dev/v1 2 | kind: PrometheusServiceLevel 3 | metadata: 4 | name: svc 5 | namespace: test-ns 6 | spec: 7 | service: "" # BAD! 8 | labels: 9 | global01k1: global01v1 10 | slos: 11 | - name: "slo1" 12 | objective: 99.9 13 | description: "This is SLO 01." 14 | labels: 15 | global02k1: global02v1 16 | sli: 17 | events: 18 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 19 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 20 | alerting: 21 | name: myServiceAlert 22 | labels: 23 | alert01k1: "alert01v1" 24 | annotations: 25 | alert02k1: "alert02k2" 26 | pageAlert: 27 | labels: 28 | alert03k1: "alert03v1" 29 | ticketAlert: 30 | labels: 31 | alert04k1: "alert04v1" 32 | - name: "slo02" 33 | objective: 95 34 | description: "This is SLO 02." 
35 | labels: 36 | global03k1: global03v1 37 | sli: 38 | raw: 39 | errorRatioQuery: | 40 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 41 | / 42 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 43 | alerting: 44 | pageAlert: 45 | disable: true 46 | ticketAlert: 47 | disable: true 48 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/bad/bad-multi-k8s.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: sloth.slok.dev/v1 3 | kind: PrometheusServiceLevel 4 | metadata: 5 | name: svc 6 | namespace: test-ns 7 | spec: 8 | service: "svc01" 9 | labels: 10 | global01k1: global01v1 11 | slos: 12 | - name: "slo1" 13 | objective: 99.9 14 | description: "This is SLO 01." 15 | labels: 16 | global02k1: global02v1 17 | sli: 18 | events: 19 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 20 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 21 | alerting: 22 | name: "" # BAD! 23 | labels: 24 | alert01k1: "alert01v1" 25 | annotations: 26 | alert02k1: "alert02k2" 27 | pageAlert: 28 | labels: 29 | alert03k1: "alert03v1" 30 | ticketAlert: 31 | labels: 32 | alert04k1: "alert04v1" 33 | - name: "slo02" 34 | objective: 95 35 | description: "This is SLO 02." 36 | labels: 37 | global03k1: global03v1 38 | sli: 39 | raw: 40 | errorRatioQuery: | 41 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 42 | / 43 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 44 | alerting: 45 | pageAlert: 46 | disable: true 47 | ticketAlert: 48 | disable: true 49 | 50 | --- 51 | apiVersion: sloth.slok.dev/v1 52 | kind: PrometheusServiceLevel 53 | metadata: 54 | name: svc-2 55 | namespace: test-ns-2 56 | spec: 57 | service: "svc02" 58 | labels: 59 | global01k1: global01v1 60 | slos: 61 | - name: "slo1" 62 | objective: 99.99 63 | description: "This is SLO 01." 64 | labels: 65 | global02k1: global02v1 66 | sli: 67 | events: 68 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 69 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 70 | alerting: 71 | name: myServiceAlert 72 | labels: 73 | alert01k1: "alert01v1" 74 | annotations: 75 | alert02k1: "alert02k2" 76 | pageAlert: 77 | labels: 78 | alert03k1: "alert03v1" 79 | ticketAlert: 80 | labels: 81 | alert04k1: "alert04v1" 82 | - name: "slo02" 83 | objective: 95 84 | description: "This is SLO 02." 85 | labels: 86 | global03k1: global03v1 87 | sli: 88 | raw: 89 | errorRatioQuery: | 90 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 91 | / 92 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 93 | alerting: 94 | pageAlert: 95 | disable: true 96 | ticketAlert: 97 | disable: true 98 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/bad/bad-multi.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: "prometheus/v1" 3 | service: "svc01" 4 | labels: 5 | global01k1: global01v1 6 | slos: 7 | - name: "slo1" 8 | objective: 99.9 9 | description: "This is SLO 01." 
10 | labels: 11 | global02k1: global02v1 12 | sli: 13 | events: 14 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 15 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 16 | alerting: 17 | name: myServiceAlert 18 | labels: 19 | alert01k1: "alert01v1" 20 | annotations: 21 | alert02k1: "alert02k2" 22 | page_alert: 23 | labels: 24 | alert03k1: "alert03v1" 25 | ticket_alert: 26 | labels: 27 | alert04k1: "alert04v1" 28 | - name: "slo02" 29 | objective: 95 30 | description: "This is SLO 02." 31 | labels: 32 | global03k1: global03v1 33 | sli: 34 | raw: 35 | error_ratio_query: | 36 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 37 | / 38 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 39 | alerting: 40 | page_alert: 41 | disable: true 42 | ticket_alert: 43 | disable: true 44 | 45 | --- 46 | version: "prometheus/v1" 47 | service: "svc02" 48 | labels: 49 | global01k1: global01v1 50 | slos: 51 | - name: "slo1" 52 | objective: 99.99 53 | description: "This is SLO 01." 54 | labels: 55 | global02k1: global02v1 56 | sli: 57 | events: 58 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 59 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 60 | alerting: 61 | name: myServiceAlert 62 | labels: 63 | alert01k1: "alert01v1" 64 | annotations: 65 | alert02k1: "alert02k2" 66 | page_alert: 67 | labels: 68 | alert03k1: "alert03v1" 69 | ticket_alert: 70 | labels: 71 | alert04k1: "alert04v1" 72 | - name: "slo02" 73 | objective: 95 74 | description: "This is SLO 02." 75 | labels: 76 | global03k1: global03v1 77 | sli: {} # BAD! 78 | alerting: 79 | page_alert: 80 | disable: true 81 | ticket_alert: 82 | disable: true 83 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/bad/bad-openslo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: openslo/v1alpha 2 | kind: SLO 3 | metadata: 4 | name: slo1 5 | displayName: Integration test SLO1 6 | spec: 7 | service: svc01 8 | description: "this is SLO1." 9 | budgetingMethod: Occurrences 10 | objectives: 11 | - ratioMetrics: 12 | good: 13 | source: prometheus 14 | queryType: promql 15 | query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}])) 16 | total: 17 | source: prometheus 18 | queryType: promql 19 | query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 20 | target: 0.999 21 | timeWindows: 22 | - count: 28 # BAD! 23 | unit: Day 24 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/good/good-aa.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "svc01" 3 | labels: 4 | global01k1: global01v1 5 | slos: 6 | - name: "slo1" 7 | objective: 99.9 8 | description: "This is SLO 01." 
9 | labels: 10 | global02k1: global02v1 11 | sli: 12 | events: 13 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 14 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 15 | alerting: 16 | name: myServiceAlert 17 | labels: 18 | alert01k1: "alert01v1" 19 | annotations: 20 | alert02k1: "alert02k2" 21 | pageAlert: 22 | labels: 23 | alert03k1: "alert03v1" 24 | ticketAlert: 25 | labels: 26 | alert04k1: "alert04v1" 27 | - name: "slo02" 28 | objective: 95 29 | description: "This is SLO 02." 30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/good/good-ab.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "svc01" 3 | labels: 4 | global01k1: global01v1 5 | slos: 6 | - name: "slo1" 7 | objective: 99.9 8 | description: "This is SLO 01." 9 | labels: 10 | global02k1: global02v1 11 | sli: 12 | events: 13 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 14 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 15 | alerting: 16 | name: myServiceAlert 17 | labels: 18 | alert01k1: "alert01v1" 19 | annotations: 20 | alert02k1: "alert02k2" 21 | pageAlert: 22 | labels: 23 | alert03k1: "alert03v1" 24 | ticketAlert: 25 | labels: 26 | alert04k1: "alert04v1" 27 | - name: "slo02" 28 | objective: 95 29 | description: "This is SLO 02." 30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/good/good-ba.yaml: -------------------------------------------------------------------------------- 1 | version: "prometheus/v1" 2 | service: "svc01" 3 | labels: 4 | global01k1: global01v1 5 | slos: 6 | - name: "slo1" 7 | objective: 99.9 8 | description: "This is SLO 01." 9 | labels: 10 | global02k1: global02v1 11 | sli: 12 | events: 13 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 14 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 15 | alerting: 16 | name: myServiceAlert 17 | labels: 18 | alert01k1: "alert01v1" 19 | annotations: 20 | alert02k1: "alert02k2" 21 | pageAlert: 22 | labels: 23 | alert03k1: "alert03v1" 24 | ticketAlert: 25 | labels: 26 | alert04k1: "alert04v1" 27 | - name: "slo02" 28 | objective: 95 29 | description: "This is SLO 02." 
30 | labels: 31 | global03k1: global03v1 32 | sli: 33 | raw: 34 | error_ratio_query: | 35 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 36 | / 37 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 38 | alerting: 39 | page_alert: 40 | disable: true 41 | ticket_alert: 42 | disable: true 43 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/good/good-k8s.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: sloth.slok.dev/v1 2 | kind: PrometheusServiceLevel 3 | metadata: 4 | name: svc 5 | namespace: test-ns 6 | spec: 7 | service: "svc01" 8 | labels: 9 | global01k1: global01v1 10 | slos: 11 | - name: "slo1" 12 | objective: 99.9 13 | description: "This is SLO 01." 14 | labels: 15 | global02k1: global02v1 16 | sli: 17 | events: 18 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 19 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 20 | alerting: 21 | name: myServiceAlert 22 | labels: 23 | alert01k1: "alert01v1" 24 | annotations: 25 | alert02k1: "alert02k2" 26 | pageAlert: 27 | labels: 28 | alert03k1: "alert03v1" 29 | ticketAlert: 30 | labels: 31 | alert04k1: "alert04v1" 32 | - name: "slo02" 33 | objective: 95 34 | description: "This is SLO 02." 35 | labels: 36 | global03k1: global03v1 37 | sli: 38 | raw: 39 | errorRatioQuery: | 40 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 41 | / 42 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 43 | alerting: 44 | pageAlert: 45 | disable: true 46 | ticketAlert: 47 | disable: true 48 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/good/good-multi-k8s.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: sloth.slok.dev/v1 3 | kind: PrometheusServiceLevel 4 | metadata: 5 | name: svc 6 | namespace: test-ns 7 | spec: 8 | service: "svc01" 9 | labels: 10 | global01k1: global01v1 11 | slos: 12 | - name: "slo1" 13 | objective: 99.9 14 | description: "This is SLO 01." 15 | labels: 16 | global02k1: global02v1 17 | sli: 18 | events: 19 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 20 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 21 | alerting: 22 | name: myServiceAlert 23 | labels: 24 | alert01k1: "alert01v1" 25 | annotations: 26 | alert02k1: "alert02k2" 27 | pageAlert: 28 | labels: 29 | alert03k1: "alert03v1" 30 | ticketAlert: 31 | labels: 32 | alert04k1: "alert04v1" 33 | - name: "slo02" 34 | objective: 95 35 | description: "This is SLO 02." 
36 | labels: 37 | global03k1: global03v1 38 | sli: 39 | raw: 40 | errorRatioQuery: | 41 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 42 | / 43 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 44 | alerting: 45 | pageAlert: 46 | disable: true 47 | ticketAlert: 48 | disable: true 49 | 50 | --- 51 | apiVersion: sloth.slok.dev/v1 52 | kind: PrometheusServiceLevel 53 | metadata: 54 | name: svc-2 55 | namespace: test-ns-2 56 | spec: 57 | service: "svc02" 58 | labels: 59 | global01k1: global01v1 60 | slos: 61 | - name: "slo1" 62 | objective: 99.99 63 | description: "This is SLO 01." 64 | labels: 65 | global02k1: global02v1 66 | sli: 67 | events: 68 | errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 69 | totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 70 | alerting: 71 | name: myServiceAlert 72 | labels: 73 | alert01k1: "alert01v1" 74 | annotations: 75 | alert02k1: "alert02k2" 76 | pageAlert: 77 | labels: 78 | alert03k1: "alert03v1" 79 | ticketAlert: 80 | labels: 81 | alert04k1: "alert04v1" 82 | - name: "slo02" 83 | objective: 95 84 | description: "This is SLO 02." 85 | labels: 86 | global03k1: global03v1 87 | sli: 88 | raw: 89 | errorRatioQuery: | 90 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 91 | / 92 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 93 | alerting: 94 | pageAlert: 95 | disable: true 96 | ticketAlert: 97 | disable: true 98 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/good/good-multi.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: "prometheus/v1" 3 | service: "svc01" 4 | labels: 5 | global01k1: global01v1 6 | slos: 7 | - name: "slo1" 8 | objective: 99.9 9 | description: "This is SLO 01." 10 | labels: 11 | global02k1: global02v1 12 | sli: 13 | events: 14 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 15 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 16 | alerting: 17 | name: myServiceAlert 18 | labels: 19 | alert01k1: "alert01v1" 20 | annotations: 21 | alert02k1: "alert02k2" 22 | page_alert: 23 | labels: 24 | alert03k1: "alert03v1" 25 | ticket_alert: 26 | labels: 27 | alert04k1: "alert04v1" 28 | - name: "slo02" 29 | objective: 95 30 | description: "This is SLO 02." 31 | labels: 32 | global03k1: global03v1 33 | sli: 34 | raw: 35 | error_ratio_query: | 36 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 37 | / 38 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 39 | alerting: 40 | page_alert: 41 | disable: true 42 | ticket_alert: 43 | disable: true 44 | 45 | --- 46 | version: "prometheus/v1" 47 | service: "svc02" 48 | labels: 49 | global01k1: global01v1 50 | slos: 51 | - name: "slo1" 52 | objective: 99.99 53 | description: "This is SLO 01." 
54 | labels: 55 | global02k1: global02v1 56 | sli: 57 | events: 58 | error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 59 | total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 60 | alerting: 61 | name: myServiceAlert 62 | labels: 63 | alert01k1: "alert01v1" 64 | annotations: 65 | alert02k1: "alert02k2" 66 | page_alert: 67 | labels: 68 | alert03k1: "alert03v1" 69 | ticket_alert: 70 | labels: 71 | alert04k1: "alert04v1" 72 | - name: "slo02" 73 | objective: 95 74 | description: "This is SLO 02." 75 | labels: 76 | global03k1: global03v1 77 | sli: 78 | raw: 79 | error_ratio_query: | 80 | sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) 81 | / 82 | sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 83 | alerting: 84 | page_alert: 85 | disable: true 86 | ticket_alert: 87 | disable: true 88 | -------------------------------------------------------------------------------- /test/integration/prometheus/testdata/validate/good/good-openslo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: openslo/v1alpha 2 | kind: SLO 3 | metadata: 4 | name: slo1 5 | displayName: Integration test SLO1 6 | spec: 7 | service: svc01 8 | description: "this is SLO1." 9 | budgetingMethod: Occurrences 10 | objectives: 11 | - ratioMetrics: 12 | good: 13 | source: prometheus 14 | queryType: promql 15 | query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}])) 16 | total: 17 | source: prometheus 18 | queryType: promql 19 | query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) 20 | target: 0.999 21 | timeWindows: 22 | - count: 30 23 | unit: Day 24 | -------------------------------------------------------------------------------- /test/integration/prometheus/validate_test.go: -------------------------------------------------------------------------------- 1 | package prometheus_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | 9 | "github.com/slok/sloth/test/integration/prometheus" 10 | ) 11 | 12 | func TestPrometheusValidate(t *testing.T) { 13 | // Tests config. 14 | config := prometheus.NewConfig(t) 15 | 16 | // Tests. 
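	// Note: --fs-include and --fs-exclude take regular expressions matched
	// against the discovered spec file paths, and exclusion takes precedence
	// over inclusion (exercised by the last case below).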
17 | 	tests := map[string]struct {
18 | 		valCmdArgs string
19 | 		expErr     bool
20 | 	}{
21 | 		"Discovery of good specs should validate correctly.": {
22 | 			valCmdArgs: "--input ./testdata/validate/good",
23 | 		},
24 | 
25 | 		"Discovery of bad specs should validate with failures.": {
26 | 			valCmdArgs: "--input ./testdata/validate/bad",
27 | 			expErr:     true,
28 | 		},
29 | 
30 | 		"Discovery of all specs should validate with failures.": {
31 | 			valCmdArgs: "--input ./testdata/validate",
32 | 			expErr:     true,
33 | 		},
34 | 
35 | 		"Discovery of all specs excluding bad ones should validate correctly.": {
36 | 			valCmdArgs: "--input ./testdata/validate --fs-exclude bad",
37 | 		},
38 | 
39 | 		"Discovery of all specs including only good ones should validate correctly.": {
40 | 			valCmdArgs: "--input ./testdata/validate --fs-include good",
41 | 		},
42 | 
43 | 		"Discovery of no specs should fail.": {
44 | 			valCmdArgs: "--input ./testdata/validate --fs-exclude .*",
45 | 			expErr:     true,
46 | 		},
47 | 
48 | 		"Discovery of all specs excluding bad and including a bad one should validate correctly because exclude has preference.": {
49 | 			valCmdArgs: "--input ./testdata/validate --fs-exclude bad --fs-include .*-aa.*",
50 | 		},
51 | 	}
52 | 
53 | 	for name, test := range tests {
54 | 		t.Run(name, func(t *testing.T) {
55 | 			assert := assert.New(t)
56 | 
57 | 			// Run with context to stop on test end.
58 | 			ctx, cancel := context.WithCancel(context.Background())
59 | 			defer cancel()
60 | 
61 | 			_, _, err := prometheus.RunSlothValidate(ctx, config, test.valCmdArgs)
62 | 
63 | 			if test.expErr {
64 | 				assert.Error(err)
65 | 			} else {
66 | 				assert.NoError(err)
67 | 			}
68 | 		})
69 | 	}
70 | }
71 | 
--------------------------------------------------------------------------------
/test/integration/prometheus/windows/7d.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: sloth.slok.dev/v1
2 | kind: AlertWindows
3 | spec:
4 |   sloPeriod: 7d
5 |   page:
6 |     quick:
7 |       errorBudgetPercent: 8
8 |       shortWindow: 5m
9 |       longWindow: 1h
10 |     slow:
11 |       errorBudgetPercent: 12.5
12 |       shortWindow: 30m
13 |       longWindow: 6h
14 |   ticket:
15 |     quick:
16 |       errorBudgetPercent: 20
17 |       shortWindow: 2h
18 |       longWindow: 24h
19 |     slow:
20 |       errorBudgetPercent: 42
21 |       shortWindow: 6h
22 |       longWindow: 72h
23 | 
--------------------------------------------------------------------------------
/test/integration/testutils/cmd.go:
--------------------------------------------------------------------------------
1 | package testutils
2 | 
3 | import (
4 | 	"bytes"
5 | 	"context"
6 | 	"fmt"
7 | 	"os"
8 | 	"os/exec"
9 | 	"regexp"
10 | 	"strings"
11 | )
12 | 
13 | var multiSpaceRegex = regexp.MustCompile(" +")
14 | 
15 | // RunSloth executes a sloth command.
16 | func RunSloth(ctx context.Context, env []string, cmdApp, cmdArgs string, nolog bool) (stdout, stderr []byte, err error) {
17 | 	// Sanitize command.
18 | 	cmdArgs = strings.TrimSpace(cmdArgs)
19 | 	cmdArgs = multiSpaceRegex.ReplaceAllString(cmdArgs, " ")
20 | 
21 | 	// Split into args.
22 | 	args := strings.Split(cmdArgs, " ")
23 | 
24 | 	// Create command.
25 | 	var outData, errData bytes.Buffer
26 | 	cmd := exec.CommandContext(ctx, cmdApp, args...)
27 | 	cmd.Stdout = &outData
28 | 	cmd.Stderr = &errData
29 | 
30 | 	// Set env.
31 | 	newEnv := append([]string{}, env...)
32 | 	newEnv = append(newEnv, os.Environ()...)
33 | 	if nolog {
34 | 		newEnv = append(newEnv,
35 | 			"SLOTH_NO_LOG=true",
36 | 			"SLOTH_NO_COLOR=true",
37 | 		)
38 | 	}
39 | 	cmd.Env = newEnv
40 | 
41 | 	// Run.
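	// cmd.Run blocks until the command exits; a non-zero exit status surfaces
	// as an error (*exec.ExitError), which the integration tests treat as a
	// failed sloth run.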
42 | 	err = cmd.Run()
43 | 
44 | 	return outData.Bytes(), errData.Bytes(), err
45 | }
46 | 
47 | func SlothVersion(ctx context.Context, slothBinary string) (string, error) {
48 | 	stdout, stderr, err := RunSloth(ctx, []string{}, slothBinary, "version", false)
49 | 	if err != nil {
50 | 		return "", fmt.Errorf("could not obtain version: %s: %w", stderr, err)
51 | 	}
52 | 
53 | 	version := string(stdout)
54 | 	version = strings.TrimSpace(version)
55 | 
56 | 	return version, nil
57 | }
58 | 
--------------------------------------------------------------------------------
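For reference, a minimal sketch of how the helpers above compose into a new
integration test case. This is a hypothetical example, not part of the suite:
it assumes a sloth binary exposed through the SLOTH_INTEGRATION_BINARY
environment variable read by NewConfig, and skips itself otherwise.

package prometheus_test

import (
	"context"
	"testing"

	"github.com/slok/sloth/test/integration/prometheus"
)

// TestValidateGoodSpecs (hypothetical name) expects the known-good specs
// under ./testdata/validate/good to validate without error.
func TestValidateGoodSpecs(t *testing.T) {
	config := prometheus.NewConfig(t) // Skips the test if no binary is configured.

	_, _, err := prometheus.RunSlothValidate(context.Background(), config, "--input ./testdata/validate/good")
	if err != nil {
		t.Fatalf("good specs should validate without error: %v", err)
	}
}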