├── .circleci
└── config.yml
├── .github
└── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── .gitignore
├── .golangci.yaml
├── .goreleaser.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── cmd
└── slo_exporter.go
├── docs
├── architecture.md
├── configuration.md
├── defining_new_slo.md
├── glossary.md
├── modules
│ ├── dynamic_classifier.md
│ ├── envoy_access_log_server.md
│ ├── event_key_generator.md
│ ├── event_metadata_renamer.md
│ ├── kafka_ingester.md
│ ├── metadata_classifier.md
│ ├── prometheus_exporter.md
│ ├── prometheus_ingester.md
│ ├── relabel.md
│ ├── slo_event_producer.md
│ ├── statistical_classifier.md
│ └── tailer.md
└── operating.md
├── examples
├── README.md
├── all_in_one
│ ├── README.md
│ ├── docker-compose.yaml
│ ├── example-domain-slo-conf.yaml
│ ├── grafana
│ │ └── provisioning
│ │ │ ├── dashboards
│ │ │ ├── SLO_Effective_Burn-rate.json
│ │ │ ├── SLO_detailed.json
│ │ │ ├── SLO_domains_overview.json
│ │ │ ├── SLO_drilldown.json
│ │ │ ├── dashboard.yml
│ │ │ └── slo_exporter.json
│ │ │ └── datasources
│ │ │ └── datasource.yml
│ ├── nginx
│ │ └── conf
│ │ │ └── nginx.conf
│ ├── prometheus
│ │ ├── alerts
│ │ └── recording_rules
│ └── slo-exporter
│ │ └── conf
│ │ ├── classification.csv
│ │ ├── slo_exporter.yaml
│ │ └── slo_rules.yaml
├── envoy_proxy
│ ├── README.md
│ ├── docker-compose.yaml
│ ├── envoy
│ │ └── envoy.yaml
│ └── slo-exporter
│ │ ├── slo_exporter.yaml
│ │ └── slo_rules.yaml
├── kafka
│ ├── README.md
│ ├── docker-compose.yaml
│ └── slo-exporter
│ │ ├── slo_exporter.yaml
│ │ └── slo_rules.yaml
├── nginx_proxy
│ ├── README.md
│ ├── classification.csv
│ ├── slo_exporter.yaml
│ ├── slo_rules.yaml
│ └── test.log
└── prometheus
│ ├── README.md
│ ├── exact_events_classification.csv
│ ├── regexp_events_classification.csv
│ ├── slo_exporter.yaml
│ └── slo_rules.yaml
├── go.mod
├── go.sum
├── grafana_dashboards
├── README.md
├── SLO_Effective_Burn-rate.json
├── SLO_detailed.json
├── SLO_domains_overview.json
├── SLO_drilldown.json
└── slo_exporter.json
├── kubernetes
├── README.md
├── slo-exporter-configmap.yaml
├── slo-exporter-service.yaml
└── slo-exporter-statefulset.yaml
├── pkg
├── config
│ └── config.go
├── dynamic_classifier
│ ├── dynamic_classifier.go
│ ├── dynamic_classifier_test.go
│ ├── matcher.go
│ ├── matcher_test.go
│ ├── memory_exact_matcher.go
│ ├── regexp_matcher.go
│ └── testdata
│ │ ├── TestClassificationByExactMatches.golden
│ │ ├── TestClassificationByRegexpMatches.golden
│ │ ├── TestLoadExactMatchesFromMultipleCSV.golden
│ │ ├── TestLoadRegexpMatchesFromMultipleCSV.golden
│ │ ├── TestMatcherExactDumpCSV.golden
│ │ ├── TestMatcherRegexpDumpCSV.golden
│ │ └── Test_DynamicClassifier_Classify_OverridesCacheFromConfig.golden
├── envoy_access_log_server
│ ├── access_log_server.go
│ ├── service_v3.go
│ ├── service_v3_test.go
│ └── util.go
├── event
│ ├── raw.go
│ ├── slo.go
│ └── slo_classification.go
├── event_key_generator
│ ├── event_key_generator.go
│ └── event_key_generator_test.go
├── event_metadata_renamer
│ ├── renamer.go
│ └── renamer_test.go
├── kafka_ingester
│ ├── kafka_ingester.go
│ └── kafka_ingester_test.go
├── metadata_classifier
│ ├── metadata_cassifier.go
│ └── metadata_cassifier_test.go
├── pipeline
│ ├── manager.go
│ ├── manager_test.go
│ ├── module.go
│ └── module_test.go
├── prober
│ ├── prober.go
│ └── prober_test.go
├── prometheus_exporter
│ ├── aggregating_counter.go
│ ├── aggregating_counter_test.go
│ ├── exemplars.go
│ ├── prometheus_exporter.go
│ └── prometheus_exporter_test.go
├── prometheus_ingester
│ ├── headerRoundTripper.go
│ ├── headerRoundTripper_test.go
│ ├── prometheus_ingester.go
│ ├── prometheus_ingester_test.go
│ ├── query_executor.go
│ └── query_executor_test.go
├── relabel
│ ├── relabel.go
│ └── relabel_test.go
├── slo_event_producer
│ ├── config.go
│ ├── config_test.go
│ ├── event_evaluator.go
│ ├── event_evaluator_test.go
│ ├── operator.go
│ ├── operator_test.go
│ ├── rule.go
│ ├── rule_test.go
│ ├── slo_event_producer.go
│ └── testdata
│ │ ├── slo_rules_invalid.yaml.golden
│ │ └── slo_rules_valid.yaml.golden
├── statistical_classifier
│ ├── statistical_classifier.go
│ ├── weighted_classifier.go
│ └── weighted_classifier_test.go
├── storage
│ ├── capped_container_test.go
│ ├── container_test.go
│ ├── in_memory.go
│ └── interfaces.go
├── stringmap
│ ├── stringmap.go
│ ├── stringmap_benchmark_test.go
│ └── stringmap_test.go
└── tailer
│ ├── tailer.go
│ └── tailer_test.go
├── prometheus
├── alerts
│ ├── error-budget.yaml
│ ├── missing_all_data.yaml
│ ├── missing_data.yaml
│ ├── slo_burn_rate.yaml
│ ├── slo_data_corrections.yaml
│ └── slo_exporter_alerts.yaml
└── recording_rules
│ ├── burn-rate.yaml
│ ├── error-budget.yaml
│ ├── events-over-time-slo-exporter.yaml
│ ├── events-over-time.yaml
│ ├── example-domain.yaml
│ ├── rate-coefficient.yaml
│ └── slo_data_corrections.yaml
├── scripts
├── benchmark.sh
└── generate_godoc.sh
├── test
├── Test_MetricsInitialization
│ ├── README.md
│ ├── classifications.csv
│ ├── logs
│ ├── metrics
│ ├── slo_exporter.yaml
│ └── slo_rules.yaml
├── Test_SloHeaders
│ ├── README.md
│ ├── classifications.csv
│ ├── logs
│ ├── metrics
│ ├── slo_exporter.yaml
│ └── slo_rules.yaml
├── Test_SloHeadersUpdateCache
│ ├── README.md
│ ├── classifications.csv
│ ├── logs
│ ├── metrics
│ ├── slo_exporter.yaml
│ └── slo_rules.yaml
└── run_tests.sh
└── tools
└── slo-rules-generator
├── README.md
├── alerting.go
├── all-in-one-example-domain.yaml
├── class.go
├── domain.go
├── go.mod
├── go.sum
├── slo-domains.yaml.example
├── slo-rules-generator.go
└── slo-rules-generator_test.go
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2.1
2 | orbs:
3 | go: circleci/go@1.7.3
4 |
5 | defaults: &defaults
6 | executor:
7 | name: go/default # Use the default executor from the orb
8 | tag: "1.23"
9 |
10 | jobs:
11 | lint:
12 | <<: *defaults
13 | steps:
14 | - checkout # checkout source code
15 | - go/load-cache # Load cached Go modules.
16 | - run: GOMAXPROCS=1 GOMEMLIMIT=1750MiB make lint
17 | - go/save-cache # Save Go modules to cache.
18 |
19 | test:
20 | <<: *defaults
21 | steps:
22 | - checkout # checkout source code
23 | - go/load-cache # Load cached Go modules.
24 | - run: make test-and-coverage
25 | - go/save-cache # Save Go modules to cache.
26 |
27 | test-release:
28 | <<: *defaults
29 | steps:
30 | - checkout
31 | - setup_remote_docker
32 | - run: make test-release
33 |
34 | build:
35 | <<: *defaults
36 | steps:
37 | - checkout # checkout source code
38 | - go/load-cache # Load cached Go modules.
39 | - run: make build
40 | - go/save-cache # Save Go modules to cache.
41 | - persist_to_workspace:
42 | root: .
43 | paths:
44 | - slo_exporter
45 |
46 | build_docker:
47 | <<: *defaults
48 | steps:
49 | - checkout
50 | - setup_remote_docker
51 | - attach_workspace:
52 | at: .
53 | - run: make docker
54 |
55 | release:
56 | <<: *defaults
57 | steps:
58 | - checkout # checkout source code
59 | - go/load-cache # Load cached Go modules.
60 | - setup_remote_docker
61 | - run: |
62 | echo "${DOCKERHUB_PASSWORD}" | docker login -u="${DOCKERHUB_USER}" --password-stdin
63 | make release
64 |
65 | workflows:
66 | version: 2
67 | slo-exporter:
68 | jobs:
69 | - lint
70 | - test
71 | - test-release:
72 | filters:
73 | branches:
74 | only: master
75 | - build:
76 | filters:
77 | tags:
78 | ignore: /.*/
79 | - build_docker:
80 | requires:
81 | - build
82 | filters:
83 | tags:
84 | ignore: /.*/
85 | - release:
86 | filters:
87 | tags:
88 | only: /^v[0-9]+(\.[0-9]+){2}(-.+|[^-.]*)$/
89 | branches:
90 | ignore: /.*/
91 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve slo-exporter
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | Please read this before submitting a bug report
11 |
12 | * **Check the [debugging guide](docs/operating.md).** You might be able to find the cause of the problem and fix things yourself. Most importantly, check if you can reproduce the problem in the latest version of slo-exporter.
13 | * **Perform a [cursory search](https://github.com/search?q=+is%3Aissue+repo%3Aseznam%2Fslo-exporter)** to see if the problem has already been reported. If it has **and the issue is still open**, add a comment to the existing issue instead of opening a new one.
14 |
15 | #### How Do I Submit A (Good) Bug Report?
16 |
17 | Explain the problem and include additional details to help maintainers reproduce the problem:
18 |
19 | * **Use a clear and descriptive title** for the issue to identify the problem.
20 | * **Describe the exact steps which reproduce the problem** in as many details as possible. For example, start by explaining how you started slo-exporter, e.g. which command exactly you used in the terminal. When listing steps, **don't just say what you did, but explain how you did it**.
21 | * **Provide specific examples to demonstrate the steps**. Include links to files or GitHub projects, or copy/pasteable snippets, which you use in those examples. If you're providing snippets in the issue, use [Markdown code blocks](https://help.github.com/articles/markdown-basics/#multiple-lines).
22 | * **Describe the behavior you observed after following the steps** and point out what exactly is the problem with that behavior.
23 | * **Explain which behavior you expected to see instead and why.**
24 | * **If you're reporting that slo-exporter crashed**, include a crash report with a stack trace from the operating system. Include the crash report in the issue in a [code block](https://help.github.com/articles/markdown-basics/#multiple-lines), a [file attachment](https://help.github.com/articles/file-attachments-on-issues-and-pull-requests/), or put it in a [gist](https://gist.github.com/) and provide link to that gist.
25 | * **If the problem is related to performance or memory**, include a [CPU profile capture](docs/operating.md#profiling) with your report.
26 | * **If the problem wasn't triggered by a specific action**, describe what you were doing before the problem happened and share more information using the guidelines below.
27 |
28 | Provide more context by answering these questions:
29 |
30 | * **Did the problem start happening recently** (e.g. after updating to a new version) or was this always a problem?
31 | * If the problem started happening recently, **can you reproduce the problem in an older version of slo-exporter?** What's the most recent version in which the problem doesn't happen?
32 | * **Can you reliably reproduce the issue?** If not, provide details about how often the problem happens and under which conditions it normally happens.
33 |
34 | Include details about your configuration and environment:
35 |
36 | * **Which version are you using?** You can get the exact version by running `slo-exporter --version` in your terminal.
37 | * **What's the name and version of the OS you're using**?
38 | * **Are you running slo-exporter in a virtual machine or container?** If so, which VM software are you using and which operating systems and versions are used for the host and the guest?
39 | * **What are your [local configuration files](docs/configuration.md) and environment variables?** `slo_exporter.yaml` and possibly others.
40 |
41 | ---
42 |
43 | #### Describe the bug
44 | FILL ME
45 |
46 | #### How to reproduce the bug
47 | FILL ME
48 |
49 | #### Expected behavior
50 | A clear and concise description of what you expected to happen.
51 |
52 | #### Additional context
53 | FILL ME
54 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[ENHANCEMENT]"
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | Please read this before submitting a feature request.
11 |
12 | #### Before Submitting An Enhancement Suggestion
13 |
14 | * **Check the [debugging guide](/docs/operating.md)** for tips — you might discover that the enhancement is already available. Most importantly, check if you're using the latest version and if you can get the desired behavior by changing [configuration settings](/docs/configuration.md).
15 | * **Perform a [cursory search](https://github.com/search?q=+is%3Aissue+repo%3Aseznam%2Fslo-exporter)** to see if the enhancement has already been suggested. If it has, add a comment to the existing issue instead of opening a new one.
16 |
17 | #### How Do I Submit A (Good) Enhancement Suggestion?
18 |
19 | Enhancement suggestions are tracked as [GitHub issues](https://guides.github.com/features/issues/). Create an issue on that repository and provide the following information:
20 |
21 | * **Use a clear and descriptive title** for the issue to identify the suggestion.
22 | * **Provide a step-by-step description of the suggested enhancement** in as many details as possible.
23 | * **Provide specific examples to demonstrate the steps**. Include copy/pasteable snippets which you use in those examples, as [Markdown code blocks](https://help.github.com/articles/markdown-basics/#multiple-lines).
24 | * **Describe the current behavior** and **explain which behavior you expected to see instead** and why.
25 | * **Explain why this enhancement would be useful** to most users.
26 | * **Specify which version you're using.** You can get the exact version by running `slo-exporter --version` in your terminal.
27 | * **Specify the name and version of the OS you're using.**
28 |
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | vendor/
2 |
3 | slo_exporter
4 | dist
5 | tmp
6 |
7 | **/*.pos
8 | **/test_output/
9 |
--------------------------------------------------------------------------------
/.golangci.yaml:
--------------------------------------------------------------------------------
1 | linters:
2 | enable:
3 | - contextcheck
4 | - durationcheck
5 | - errcheck
6 | - errname
7 | - errorlint
8 | - gocritic
9 | - gofmt
10 | - gofumpt
11 | - goimports
12 | - gosimple
13 | - govet
14 | - ineffassign
15 | - misspell
16 | - nakedret
17 | - nilerr
18 | - nilnil
19 | - prealloc
20 | - predeclared
21 | - promlinter
22 | - revive
23 | - staticcheck
24 | - stylecheck
25 | - typecheck
26 | - unconvert
27 | - unparam
28 | - unused
29 | - usestdlibvars
30 |
31 | linters-settings:
32 | # I'm biased and I'm enabling more than 100 checks
33 | # Might be too much for you. See https://go-critic.com/overview.html
34 | gocritic:
35 | enabled-tags:
36 | - diagnostic
37 | - experimental
38 | - opinionated
39 | - performance
40 | - style
41 | disabled-checks:
42 |       # These 3 will detect many cases, but they do make sense
43 |       # if it's performance-oriented code
44 | - hugeParam
45 | - rangeExprCopy
46 | - rangeValCopy
47 |
48 | errcheck:
49 | # Report `a := b.(MyStruct)` when `a, ok := ...` should be.
50 | check-type-assertions: true # Default: false
51 |
52 | # Report skipped checks:`num, _ := strconv.Atoi(numStr)`.
53 | check-blank: true # Default: false
54 |
55 | # Function to skip.
56 | exclude-functions:
57 | - io/ioutil.ReadFile
58 | - io.Copy(*bytes.Buffer)
59 | - io.Copy(os.Stdout)
60 |
61 | govet:
62 | disable:
63 | - fieldalignment # I'm ok to waste some bytes
64 |
65 | nakedret:
66 | max-func-lines: 1 # Default: 30
67 |
68 | issues:
69 | exclude:
70 | - "var-naming: don't use an underscore in package name"
71 | - "ST1003: should not use underscores in package names"
72 |
--------------------------------------------------------------------------------
/.goreleaser.yml:
--------------------------------------------------------------------------------
1 | before:
2 | hooks:
3 | - go mod download
4 | - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
5 |
6 | builds:
7 | - main: ./cmd/slo_exporter.go
8 | binary: slo_exporter
9 | env:
10 | - CGO_ENABLED=0
11 | goos:
12 | - linux
13 | - windows
14 | - darwin
15 | goarch:
16 | - amd64
17 | - "386"
18 | - arm64
19 |
20 | ignore:
21 | - goos: darwin
22 | goarch: "386"
23 |
24 | source:
25 | enabled: true
26 |
27 | dockers:
28 | - goos: linux
29 | goarch: amd64
30 | image_templates:
31 | - seznam/slo-exporter:{{ .Tag }}-amd64
32 | - seznam/slo-exporter:v{{ .Major }}.{{ .Minor }}-amd64
33 | - seznam/slo-exporter:v{{ .Major }}-amd64
34 | - seznam/slo-exporter:latest-amd64
35 | use: buildx
36 | build_flag_templates:
37 | - --pull
38 | # Labels according to opencontainers label schema https://github.com/opencontainers/image-spec/blob/master/annotations.md
39 | - --label=org.opencontainers.image.created={{.Date}}
40 | - --label=org.opencontainers.image.revision={{.FullCommit}}
41 | - --label=org.opencontainers.image.version={{.Version}}
42 |
43 | - --label=org.opencontainers.image.title={{.ProjectName}}
44 |       - --label=org.opencontainers.image.description=Tool to evaluate and generate standardized SLO metrics from distinct data sources.
45 | - --label=org.opencontainers.image.vendor=Seznam, a.s.
46 | - --label=org.opencontainers.image.authors=sklik.devops@firma.seznam.cz
47 | - --label=org.opencontainers.image.url={{.GitURL}}
48 | - --label=org.opencontainers.image.documentation={{.GitURL}}
49 | - "--platform=linux/amd64"
50 | - goos: linux
51 | goarch: arm64
52 | image_templates:
53 | - seznam/slo-exporter:{{ .Tag }}-arm64
54 | - seznam/slo-exporter:v{{ .Major }}.{{ .Minor }}-arm64
55 | - seznam/slo-exporter:v{{ .Major }}-arm64
56 | - seznam/slo-exporter:latest-arm64
57 | use: buildx
58 | build_flag_templates:
59 | - --pull
60 | # Labels according to opencontainers label schema https://github.com/opencontainers/image-spec/blob/master/annotations.md
61 | - --label=org.opencontainers.image.created={{.Date}}
62 | - --label=org.opencontainers.image.revision={{.FullCommit}}
63 | - --label=org.opencontainers.image.version={{.Version}}
64 |
65 | - --label=org.opencontainers.image.title={{.ProjectName}}
66 |       - --label=org.opencontainers.image.description=Tool to evaluate and generate standardized SLO metrics from distinct data sources.
67 | - --label=org.opencontainers.image.vendor=Seznam, a.s.
68 | - --label=org.opencontainers.image.authors=sklik.devops@firma.seznam.cz
69 | - --label=org.opencontainers.image.url={{.GitURL}}
70 | - --label=org.opencontainers.image.documentation={{.GitURL}}
71 | - "--platform=linux/arm64"
72 | docker_manifests:
73 | - name_template: "seznam/slo-exporter:{{ .Tag }}"
74 | image_templates:
75 | - "seznam/slo-exporter:{{ .Tag }}-amd64"
76 | - "seznam/slo-exporter:{{ .Tag }}-arm64"
77 | - name_template: "seznam/slo-exporter:v{{ .Major }}.{{ .Minor }}"
78 | image_templates:
79 | - "seznam/slo-exporter:v{{ .Major }}.{{ .Minor }}-amd64"
80 | - "seznam/slo-exporter:v{{ .Major }}.{{ .Minor }}-arm64"
81 | - name_template: "seznam/slo-exporter:v{{ .Major }}"
82 | image_templates:
83 | - "seznam/slo-exporter:v{{ .Major }}-amd64"
84 | - "seznam/slo-exporter:v{{ .Major }}-arm64"
85 | - name_template: "seznam/slo-exporter:latest"
86 | image_templates:
87 | - "seznam/slo-exporter:latest-amd64"
88 | - "seznam/slo-exporter:latest-arm64"
89 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 |
3 | In order to foster an inclusive, kind, harassment-free, and cooperative community, the community enforces this code of conduct on the project.
4 |
5 | ## Summary
6 |
7 | Harassment in code and discussion or violation of physical boundaries is completely unacceptable anywhere in the project codebases, issue trackers, chatrooms, mailing lists, meetups, and other events. Violators will be warned by the core team. Repeat violations will result in being blocked or banned by the core team at or before the 3rd violation.
8 |
9 | ## In detail
10 |
11 | Harassment includes offensive verbal comments related to gender identity, gender expression, sexual orientation, disability, physical appearance, body size, race, religion, sexual images, deliberate intimidation, stalking, sustained disruption, and unwelcome sexual attention.
12 |
13 | Individuals asked to stop any harassing behavior are expected to comply immediately.
14 |
15 | Maintainers are also subject to the anti-harassment policy.
16 |
17 | If anyone engages in harassing behavior, including maintainers, we may take appropriate action, up to and including warning the offender, deletion of comments, removal from the project’s codebase and communication systems, and escalation to GitHub support.
18 |
19 | If you are being harassed, notice that someone else is being harassed, or have any other concerns, please contact a member of the core team immediately.
20 |
21 | We expect everyone to follow these rules anywhere in the project codebases, issue trackers, chatrooms, and mailing lists.
22 |
23 | Finally, don’t forget that it is human to make mistakes! We all do. Let’s work together to help each other, resolve issues, and learn from the mistakes that we will all inevitably make from time to time.
24 |
25 | ## Thanks
26 |
27 | Thanks to the [thoughtbot team](https://thoughtbot.com/).
28 |
29 | ## (Code of conduct) license
30 |
31 | To the extent possible under law, the [thoughtbot team](https://thoughtbot.com/) has waived all copyright and related or neighboring rights to thoughtbot Code of Conduct. This work is published from the United States.
32 |
33 | 
34 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | #### Table Of Contents
2 |
3 | [Code of Conduct](#code-of-conduct)
4 |
5 | [I just have a question](#i-just-have-a-question)
6 |
7 | [Your First Code Contribution](#your-first-code-contribution)
8 |
9 | [Pull Requests](#pull-requests)
10 |
11 | [Styleguides](#styleguides)
12 |
13 | ### Code of Conduct
14 | This project and everyone participating in it is governed by the [Code of Conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code.
15 |
16 |
17 | ## I just have a question
18 | Please file an issue with the `question` label or contact us via [Slack](/README.md#community).
19 |
20 | ### Your First Code Contribution
21 |
22 | Unsure where to begin contributing to slo-exporter? You can start by looking through these `good-first-issue` and `help-wanted` issues:
23 |
24 | * [Good first issues](https://github.com/seznam/slo-exporter/labels/good%20first%20issue) - issues which should only require a few lines of code, and a test or two.
25 | * [Help wanted issues](https://github.com/seznam/slo-exporter/labels/help%20wanted) - issues which should be a bit more involved than `good-first-issues`.
26 |
27 | ### Pull Requests
28 |
29 | Please follow these steps to have your contribution considered by the maintainers:
30 |
31 | 1. Follow the [styleguides](#styleguides)
32 | 2. After you submit your pull request, verify that all [status checks](https://help.github.com/articles/about-status-checks/) are passing. What if the status checks are failing? If a status check is failing, and you believe that the failure is unrelated to your change, please leave a comment on the pull request explaining why you believe the failure is unrelated.
33 |
34 | While the prerequisites above must be satisfied prior to having your pull request reviewed, the reviewer(s) may ask you to complete additional design work, tests, or other changes before your pull request can be ultimately accepted.
35 |
36 | ## Styleguides
37 |
38 | ### Git Commit Messages
39 |
40 | * Use the present tense ("Add feature" not "Added feature")
41 | * Use the imperative mood ("Move cursor to..." not "Moves cursor to...")
42 | * Limit the first line to 72 characters or less
43 | * Reference issues and pull requests liberally after the first line
44 |
45 | ### Golang Styleguide
46 |
47 | Follow the advice of golang [revive](https://github.com/mgechev/revive) and make sure revive reports the same number of issues or fewer.
48 |
49 | ### Documentation Styleguide
50 |
51 | * Use [Markdown](https://daringfireball.net/projects/markdown).
52 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian:stable-slim
2 |
3 | RUN apt-get update && apt-get install ca-certificates -y && apt-get clean
4 |
5 | COPY slo_exporter /slo_exporter/
6 | COPY Dockerfile /
7 |
8 | WORKDIR /slo_exporter
9 |
10 | ENTRYPOINT ["/slo_exporter/slo_exporter"]
11 |
12 | CMD ["--help"]
13 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/make -f
2 | SRC_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
3 | TMP_DIR ?= $(SRC_DIR)/tmp
4 | TMP_BIN_DIR ?= $(TMP_DIR)/bin
5 |
6 | GORELEASER_VERSION ?= v2.4.4
7 |
8 | .PHONY: all
9 | all: lint test-and-coverage build test-release
10 |
11 | $(TMP_DIR):
12 | mkdir -p $(TMP_DIR)
13 |
14 | $(TMP_BIN_DIR):
15 | mkdir -p $(TMP_BIN_DIR)
16 |
17 | GORELEASER ?= $(TMP_BIN_DIR)/goreleaser
18 | $(GORELEASER): $(TMP_BIN_DIR)
19 | @echo "Downloading goreleaser version $(GORELEASER_VERSION) to $(TMP_BIN_DIR) ..."
20 | @curl -sNL "https://github.com/goreleaser/goreleaser/releases/download/$(GORELEASER_VERSION)/goreleaser_Linux_x86_64.tar.gz" | tar -xzf - -C $(TMP_BIN_DIR)
21 |
22 | RELEASE_NOTES ?= $(TMP_DIR)/release_notes
23 | $(RELEASE_NOTES): $(TMP_DIR)
24 | @echo "Generating release notes to $(RELEASE_NOTES) ..."
25 | @csplit -q -n1 --suppress-matched -f $(TMP_DIR)/release-notes-part CHANGELOG.md '/## \[\s*v.*\]/' {1}
26 | @mv $(TMP_DIR)/release-notes-part1 $(RELEASE_NOTES)
27 | @rm $(TMP_DIR)/release-notes-part*
28 |
29 | .PHONY: golangci-lint
30 | golangci-lint:
31 | @echo "Downloading golangci-lint..."
32 | go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0
33 |
34 | .PHONY: lint
35 | lint: golangci-lint
36 | golangci-lint run --timeout 10m
37 |
38 | .PHONY: lint-fix
39 | lint-fix: golangci-lint
40 | golangci-lint run --fix --timeout 10m
41 |
42 | SLO_EXPORTER_BIN ?= slo_exporter
43 | .PHONY: build
44 | build:
45 | GOOS=$(OS) GOARCH=$(ARCH) CGO_ENABLED=0 go build -o $(SLO_EXPORTER_BIN) -a $(SRC_DIR)/cmd/slo_exporter.go
46 |
47 | .PHONY: docker-build
48 | docker: build
49 | docker build -t slo_exporter .
50 |
51 | .PHONY: e2e-test
52 | e2e-test: build
53 | ./test/run_tests.sh
54 |
55 | .PHONY: test
56 | test: $(TMP_DIR)
57 | go test -v --race -coverprofile=$(TMP_DIR)/coverage.out $(shell go list $(SRC_DIR)/... | grep -v /vendor/)
58 |
59 | .PHONY: benchmark
60 | benchmark: clean
61 | ./scripts/benchmark.sh
62 |
63 | .PHONY: test-and-coverage
64 | test-and-coverage: test
65 | go tool cover -func $(TMP_DIR)/coverage.out
66 |
67 | .PHONY: cross-build
68 | cross-build: $(GORELEASER)
69 | $(GORELEASER) build --clean
70 |
71 | .PHONY: test-release
72 | test-release: $(RELEASE_NOTES) $(GORELEASER)
73 | $(GORELEASER) release --snapshot --clean --release-notes $(RELEASE_NOTES)
74 |
75 | .PHONY: release
76 | release: $(RELEASE_NOTES) $(GORELEASER)
77 | @echo "Releasing new version do GitHub and DockerHub using goreleaser..."
78 | $(GORELEASER) release --clean --release-notes $(RELEASE_NOTES)
79 |
80 | .PHONY: clean
81 | clean:
82 | rm -rf dist $(TMP_DIR) $(SLO_EXPORTER_BIN)
83 | find . -type f -name "*.pos" -prune -exec rm -f {} \;
84 | find . -type d -name "test_output" -prune -exec rm -rf {} \;
85 |
--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
1 | # Architecture
2 | SLO-exporter is written in Go and built using [the pipeline pattern](https://blog.golang.org/pipelines).
3 |
4 | The processed event is passed from one module to another to allow its modification or filtering
5 | for the final state to be reported as an SLI event.
6 |
7 | The flow of the processing pipeline can be dynamically set using configuration file, so it can be used
8 | for various use cases and event types.
9 |
10 | ### Event Types
11 | Slo-exporter differentiates between two event types:
12 |
13 | ##### Raw
14 | This is an event which came from the data source, it has metadata and quantity
15 | and you can set its event key which will be in the resulting metrics and can be used for classification of the event.
16 |
17 | ##### SLO event
18 | Final event generated from the raw event. This event already has an evaluated result and classification
19 | and is then reported to output metrics.
20 |
21 | ### Module types
22 | There is a set of implemented modules to be used, divided into three basic types based on their input/output.
23 |
24 | ##### `producer`
25 | Does not read any events but produces them. These modules serve as sources of the events.
26 |
27 | ##### `ingester`
28 | Reads events but does not produce any. These modules serve for reporting the SLO metrics to some external systems.
29 |
30 | ##### `processor`
31 | Combination of `producer` and `ingester`. It reads an event and produces new or modified one.
32 |
--------------------------------------------------------------------------------
/docs/configuration.md:
--------------------------------------------------------------------------------
1 | # Configuration
2 | Slo exporter itself is configured using one [base YAML file](#base-config).
3 | Path to this file is configured using the `--config-file` flag.
4 | Additional configuration files might be needed by some modules
5 | depending on their needs and if they are used in the pipeline at all.
6 |
7 | #### ENV variables
8 | Every configuration option in the base YAML file can be overridden by an ENV variable (the option MUST be present in the YAML config for the ENV variable to be loaded; its value can even be an empty string).
9 | The schema of ENV variable naming is the `SLO_EXPORTER_` prefix and then any key of the YAML
10 | structure in uppercase without any underscores. Underscores are used for separating nested structures.
11 | Example: `SLO_EXPORTER_WEBSERVERLISTENADDRESS=0.0.0.0:8080` or for module configuration `SLO_EXPORTER_TAILER_TAILEDFILE=access.log`
12 |
13 | #### CMD flags
14 | ```bash
15 | $ ./slo_exporter --help-long
16 | usage: slo_exporter --config-file=CONFIG-FILE [<flags>]
17 |
18 | Flags:
19 | --help Show context-sensitive help (also try --help-long and --help-man).
20 | --config-file=CONFIG-FILE SLO exporter configuration file.
21 |   --log-level="info"         Log level (error, warn, info, debug, trace).
22 | --log-format="text" Log format (text, json).
23 | --check-config Only check config file and exit with 0 if ok and other status code if not.
24 | ```
25 |
26 | #### Processing pipeline
27 | Slo-exporter allows to dynamically compose the pipeline structure,
28 | but there are a few basic rules it needs to follow:
29 | - [`producer`](#producers) can be only at the beginning of the pipeline (meaning only single producer is allowed)
30 | - [`ingester`](#ingesters) module cannot be at the beginning of pipeline.
31 | - [`ingester`](#ingesters) module can only be linked to preceding [`producer`](architecture.md#producer) module.
32 | - Type of produced event by the preceding module must match the ingested type of the following one.
33 |
34 |
35 | ### Base config
36 | ```yaml
37 | # Address where the web interface should listen on.
38 | webServerListenAddress: "0.0.0.0:8080"
39 | # Maximum time to wait for all events to be processed after receiving SIGTERM or SIGINT.
40 | maximumGracefulShutdownDuration: "10s"
41 | # How long to wait after the processing pipeline has been shut down before stopping the http server with metric serving.
42 | # Useful to make sure metrics are scraped by Prometheus. Ideally set it to Prometheus scrape interval + 1s or more.
43 | # Should be less than or equal to maximumGracefulShutdownDuration.
44 | afterPipelineShutdownDelay: "1s"
45 |
46 | # Defines architecture of the pipeline how the event will be processed by the modules.
47 | pipeline: []
48 |
49 | # Contains configuration for distinct pipeline module.
50 | modules:
51 | :
52 | ```
53 |
54 | ### `moduleType`:
55 |
56 | ##### Producers:
57 | Only produces new events from the specified data source.
58 | - [`tailer`](modules/tailer.md)
59 | - [`prometheusIngester`](modules/prometheus_ingester.md)
60 | - [`envoyAccessLogServer`](modules/envoy_access_log_server.md)
61 | - [`kafkaIngester`](modules/kafka_ingester.md)
63 |
64 | ##### Processors:
65 | Reads input events, does some processing based on the module type and produces a modified event.
66 | - [`eventKeyGenerator`](modules/event_key_generator.md)
67 | - [`metadataClassifier`](modules/metadata_classifier.md)
68 | - [`relabel`](modules/relabel.md)
69 | - [`dynamicClassifier`](modules/dynamic_classifier.md)
70 | - [`statisticalClassifier`](modules/statistical_classifier.md)
71 | - [`sloEventProducer`](modules/slo_event_producer.md)
72 |
73 | ##### Ingesters:
74 | Only reads input events but does not produce any.
75 | - [`prometheusExporter`](modules/prometheus_exporter.md)
76 |
77 | Details how they work and their `moduleConfig` can be found in their own
78 | linked documentation in the [docs/modules](modules) folder.
79 |
80 | #### Configuration examples
81 | Actual examples of usage with full configuration can be found in the [`examples/`](examples) directory.
82 |
83 | #### Configuration testing
84 | If you want to verify that your configuration is valid, use the `--check-config` flag.
85 | Slo-exporter then just verifies if the configuration is valid and exits with status 0 if ok and 1 if not.
86 |
--------------------------------------------------------------------------------
/docs/glossary.md:
--------------------------------------------------------------------------------
1 | ## Glossary
2 | Here we describe some of the terms used through the repository. We assume that you have read chapters on SLO from Google's [SRE book](https://landing.google.com/sre/sre-book/toc/) and [SRE workbook](https://landing.google.com/sre/workbook/toc/), so the main focus here is to describe the terms specific to slo-exporter.
3 |
4 | ### locality, namespace
5 | We use these labels internally to differentiate between individual K8S clusters (`locality`) and K8S namespaces (`namespace`).
6 |
7 | ### slo-domain
8 | This label groups slo-types and slo-classes into single entity which shares the same error budget policy and stakeholders. SLO domain usually contains multiple error budgets (equal to no. of slo-types * number of slo-classes for individual slo-types).
9 |
10 | ### slo-type
11 | Differentiates individual SLIs - e.g. freshness, availability, etc. Some of the SLIs may be represented by multiple slo-types, multiple percentiles for latency SLI as slo-types latency90, latency99 as an example.
12 |
13 | ### slo-class
14 | Label which enables grouping of events from the same slo-domain and slo-type. It may serve multiple purposes, e.g. to
15 | - group events to the same classes of importance
16 | - group events which share the same SLO thresholds
17 | - group events with similar frequency of occurrence
18 |
19 | ### event_key
20 | The last level of SLO event's grouping. Its content depends on desired level of SLO drilldown accuracy. It may contain name of RPC method, or normalized path of HTTP request together with HTTP method (e.g. `GET:/campaigns/list`). See [architecture](./architecture.md) for details on SLO event's structure.
21 |
22 | ### Error budget policy
23 | A formal document which specifies actions which are to be triggered based on current state of error budget. Stopping all rollouts and shifting developers' focus on service's stability when error budget is depleted is the most common example. See [example error budget policy as published by Google](https://landing.google.com/sre/workbook/chapters/error-budget-policy/)
--------------------------------------------------------------------------------
/docs/modules/dynamic_classifier.md:
--------------------------------------------------------------------------------
1 | # Dynamic classifier
2 |
3 | | | |
4 | |----------------|---------------------|
5 | | `moduleName` | `dynamicClassifier` |
6 | | Module type | `processor` |
7 | | Input event | `raw` |
8 | | Output event | `raw` |
9 |
10 | The SLO calculation is based on some domains and classes which group together
11 | events by their functionality but also priority or demands on their quality.
12 |
13 | This is called classification and for the SLO calculation you need to assign those events
14 | to their domains and classes. The information on how to classify them
15 | can be specified using CSV files or it can come along with the event.
16 |
17 | This module checks if the incoming event isn't already classified and if it isn't, it checks
18 | the CSV file specifications if they can classify the event and adds the classification if possible.
19 |
20 | The motivation behind this is that the application itself can have the classification defined in its code.
21 | Then it just passes it along with the event (HTTP request in headers for example) and there is no need
22 | to have the classification held centrally somewhere.
23 |
24 | There is one issue, for example when generating SLO events from proxy log which proxies traffic to web
25 | server sending those classification along. If the application stops working, it won't send the
26 | classification, so we wouldn't know how to classify it. To mitigate this issue this module also
27 | caches all the classifications of input events which are already classified.
28 | This way it can classify the events even if the application goes down if they were called before.
29 |
30 | Also, this cache can be initialized with defined values on startup, so that we can correctly classify events even for application which does not provide us with the classification by themselves.
31 |
32 |
33 | #### `moduleConfig`
34 | ```yaml
35 | # Paths to CSV files containing exact match classification rules.
36 | exactMatchesCsvFiles: []
37 | # Paths to CSV files containing regexp match classification rules.
38 | regexpMatchesCsvFiles:
39 | - "conf/userportal.csv"
40 | # Metadata key names of the event which will be added to the `events_processed_total` metric if the event cannot be classified.
41 | # Name of the resulting label will be converted to snake case and prefixed with `metadata_`
42 | unclassifiedEventMetadataKeys:
43 | - "userAgent"
44 | ```
45 |
46 | ##### Example of the CSV with exact classification:
47 | ```csv
48 | test-domain,test-app,test-class,"GET:/testing-endpoint"
49 | ```
50 |
51 | ##### Example of the CSV with regexp classification:
52 | ```csv
53 | test-domain,test-app,test-class,"/api/test/.*"
54 | test-domain,test-app,test-class-all,"/api/.*"
55 | ```
56 |
57 | ##### CSV comments
58 | CSV configuration files support single line comments. Comment has to start with the `#` character with no leading whitespaces.
59 | Example:
60 | ```csv
61 | # Example of comment
62 | test-domain,test-app,test-class,"/api/test/.*"
63 | ```
64 |
65 |
66 |
--------------------------------------------------------------------------------
/docs/modules/event_key_generator.md:
--------------------------------------------------------------------------------
1 | # Event key generator
2 |
3 | | | |
4 | |----------------|---------------------|
5 | | `moduleName` | `eventKeyGenerator` |
6 | | Module type | `processor` |
7 | | Input event | `raw` |
8 | | Output event | `raw` |
9 |
10 | This module allows you to generate an identifier of the event type.
11 | It will join all values of specified event metadata keys (if found) using the separator
12 | and use it as the new identifier.
13 |
14 | `moduleConfig`
15 | ```yaml
16 | # Separator to be used to join the selected metadata values.
17 | filedSeparator: ":"
18 | # If the event key should be overwritten if it's already set for the input event.
19 | overrideExistingEventKey: true
20 | # Keys which values will be joined as the resulting eventKey in specified order
21 | metadataKeys:
22 | -
23 | ```
24 |
25 | If given metadata_key is missing in the event's metadata, the empty value is not included in the resulting eventKey.
26 |
27 | E.g. given the following configuration:
28 | ```
29 | metadataKeys:
30 |   - app
31 |   - name
32 |   - endpoint
33 | ```
34 | The following metadata `{'app': 'test_app', 'endpoint': 'test_endpoint'}` would result to event_key `test_app:test_endpoint`.
35 |
36 |
--------------------------------------------------------------------------------
/docs/modules/event_metadata_renamer.md:
--------------------------------------------------------------------------------
1 | # Event metadata renamer
2 |
3 | *Module status is _experimental_, it may be modified or removed even in non-major release.*
4 |
5 | | | |
6 | |----------------|------------------------|
7 | | `moduleName` | `eventMetadataRenamer` |
8 | | Module type | `processor` |
9 | | Input event | `raw` |
10 | | Output event | `raw` |
11 |
12 | This module allows you to modify the event metadata by renaming its keys. Refusals of overriding an already existing _destination_ are reported as a Warning log as well as within exposed Prometheus' metric.
13 |
14 | `moduleConfig`
15 | ```yaml
16 | eventMetadataRenamerConfigs:
17 | - source: keyX
18 | destination: keyY
19 | ```
20 |
--------------------------------------------------------------------------------
/docs/modules/kafka_ingester.md:
--------------------------------------------------------------------------------
1 | # Kafka ingester
2 |
3 | | | |
4 | |----------------|-------------------------|
5 | | `moduleName` | `kafkaIngester` |
6 | | Module type | `producer` |
7 | | Output event | `raw` |
8 |
9 | Kafka ingester generates events from Kafka messages.
10 |
11 | `moduleConfig`
12 | ```yaml
13 | # Allow verbose logging of events within Kafka library. Global logger with its configured log level is used.
14 | logKafkaEvents: false
15 | # Allow logging of errors within Kafka library. Global logger with its configured log level is used.
16 | logKafkaErrors: true
17 | # List of Kafka brokers
18 | brokers:
19 | - # e.g. kafka-1.example.com:9092
20 | topic: ""
21 | groupId: ""
22 | # commitInterval indicates the interval at which offsets are committed to the broker.
23 | # If 0 (default), commits will be handled synchronously.
24 | commitInterval: # e.g. 0, 5s, 10m
25 | # retentionTime optionally sets the length of time the consumer group will be saved by the broker.
26 | # Default: 24h
27 | retentionTime:
28 | # fallbackStartOffset determines from whence the consumer group should begin consuming when it finds a partition without a committed offset.
29 | # Default: FirstOffset
30 | fallbackStartOffset:
31 | ```
32 |
33 |
34 | For every received message from Kafka:
35 | - data in Key is ignored
36 | - data in Value is unmarshalled according to the schema version specified in Kafka message header `slo-exporter-schema-version` (defaults to `v1` if none specified).
37 |
38 | ### Supported data schemas
39 | #### `v1`
40 | ```
41 | {
42 | "metadata": {
43 | "name": "eventName"
44 | ...
45 | },
46 | # Defaults to 1 if none specified
47 | "quantity": "10",
48 | "slo_classification": {
49 | "app": "testApp",
50 | "class": "critical",
51 | "domain": "testDomain"
52 | }
53 | }
54 | ```
55 |
56 | Strictly speaking, none of the keys is mandatory, however please note that:
57 | - Event with explicitly set quantity=0 is basically noop for Producer module. To give an example, prometheusExporter does not increment any SLO metric for such events.
58 | - Event with empty Metadata does not allow much logic in following modules.
59 | - In case you want to allow ingesting events without SLO classification, you need to make sure that all events are classified within rest of the SLO exporter pipeline.
60 |
--------------------------------------------------------------------------------
/docs/modules/metadata_classifier.md:
--------------------------------------------------------------------------------
1 | # Metadata classifier
2 |
3 | | | |
4 | |----------------|------------------------------|
5 | | `moduleName` | `metadataClassifier` |
6 | | Module type | `processor` |
7 | | Input event | `raw` |
8 | | Output event | `raw` |
9 |
10 | This module allows you to classify an event using its metadata.
11 | Specify keys whose values will be used as the corresponding slo classification items.
12 | If the key cannot be found, original value of classification will be left intact.
13 | By default, the module will override event classification.
14 | This can be disabled to classify it only if it wasn't classified before.
15 |
16 | `moduleConfig`
17 | ```yaml
18 | # Key of metadata value to be used as classification slo domain.
19 | sloDomainMetadataKey:
20 | # Key of metadata value to be used as classification slo class.
21 | sloClassMetadataKey:
22 | # Key of metadata value to be used as classification slo app.
23 | sloAppMetadataKey:
24 | # If classification of already classified event should be overwritten.
25 | overrideExistingValues: true
26 | ```
27 |
--------------------------------------------------------------------------------
/docs/modules/prometheus_exporter.md:
--------------------------------------------------------------------------------
1 | # Prometheus exporter
2 |
3 | | | |
4 | |----------------|----------------------|
5 | | `moduleName` | `prometheusExporter` |
6 | | Module type | `ingester` |
7 | | Input event | `SLO` |
8 |
9 | This module exposes the SLO metrics in Prometheus format, so they can be
10 | scraped, computed, visualized and alerted on.
11 |
12 | SLO is often computed over long time ranges such as 4 weeks.
13 | But on the other hand, for debugging it is essential to be able to distinct what event type
14 | caused the issue. To allow this, this exporter exposes cascade of aggregated metrics (see the example below).
15 | From the highest level over whole slo domain to the lowest granularity of each event type.
16 |
17 | This way the alerting and usual visualization can use the high level metrics, but in case of issues
18 | it's possible to drill down right to the root cause.
19 |
20 | The `relabel` module is intended to mitigate possible issues with exploding event type cardinality.
21 | But to make sure that the cardinality explosion is avoided even if some unique event type slips through,
22 | the module allows setting a maximum limit of exposed event types. Any other new one will be replaced with the configured placeholder.
23 |
24 | `moduleConfig`
25 | ```yaml
26 | # Name of the resulting counter metric to be exposed representing counter of slo events by it's classification and result.
27 | metricName: "slo_events_total"
28 | # Limit of unique event keys, when exceeded, the event key in the label is replaced with placeholder.
29 | maximumUniqueEventKeys: 1000
30 | # Placeholder to replace new event keys when the limit is hit.
31 | ExceededKeyLimitPlaceholder: "cardinalityLimitExceeded"
32 | # *Experimental* List of original raw event metadata keys to be added as an exemplars labels.
33 | ExemplarMetadataKeys: ["trace-id"]
34 | # Names of labels to be used for specific event information.
35 | labelNames:
36 | # Contains information about the event result (success, fail, ...).
37 | result: "result"
38 | # Domain of the SLO event.
39 | sloDomain: "slo_domain"
40 | # SLO class of the event.
41 | sloClass: "slo_class"
42 | # Application, to which the event belongs.
43 | sloApp: "slo_app"
44 | # Unique identifier of the event.
45 | # This label holds value of Key attribute of the input SLO event
46 | eventKey: "event_key"
47 | ```
48 |
49 | ## Exposed metrics example
50 | Given the default configuration as specified above, the resulting exposed metrics will be as follows:
51 | ```
52 | slo_domain:slo_events_total{result=~"success|fail",slo_domain="__domain_name__"}
53 | slo_domain_slo_class:slo_events_total{result=~"success|fail",slo_domain="__domain_name__",slo_class="__slo_class__"}
54 | slo_domain_slo_class_slo_app:slo_events_total{result=~"success|fail",slo_domain="__domain_name__",slo_class="__slo_class__",slo_app="__slo_app__"}
55 | slo_domain_slo_class_slo_app_event_key:slo_events_total{result=~"success|fail",slo_domain="__domain_name__",slo_class="__slo_class__",slo_app="__slo_app__",event_key="__event_key__"}
56 | ```
57 |
58 | Each of the timeseries will have additional labels which are (optionally) specified in [sloEventProducer](./slo_event_producer.md) rules configuration (as `additional_metadata`) - for example slo_version, slo_type,...
59 |
--------------------------------------------------------------------------------
/docs/modules/relabel.md:
--------------------------------------------------------------------------------
1 | # Relabel
2 |
3 | | | |
4 | |----------------|--------------|
5 | | `moduleName` | `relabel` |
6 | | Module type | `processor` |
7 | | Input event | `raw` |
8 | | Output event | `raw` |
9 |
10 | This module allows you to modify the event metadata or drop the event at all.
11 | It uses native Prometheus `relabel_config` syntax. In this case metadata is referred as labels.
12 | See [the upstream documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config)
13 | for more info. Referenced metadata keys needs to be a valid Prometheus' label name.
14 |
15 |
16 | `moduleConfig`
17 | ```yaml
18 | eventRelabelConfigs:
19 | -
20 | ```
21 |
22 | You can find some [examples here](/examples).
23 |
--------------------------------------------------------------------------------
/docs/modules/statistical_classifier.md:
--------------------------------------------------------------------------------
1 | # Statistical classifier
2 |
3 | | | |
4 | |----------------|-------------------------|
5 | | `moduleName` | `statisticalClassifier` |
6 | | Module type | `processor` |
7 | | Input event | `raw` |
8 | | Output event | `raw` |
9 |
10 | This module observes the statistical distribution of all incoming already classified events.
11 | This distribution is then used to classify incoming unclassified events.
12 | It produces only classified events, if any error or issue is encountered, the event is dropped.
13 | You can specify default weights which will be used if there were no events recently (at least for interval specified in `historyWindowSize`) to calculate the weights from.
14 |
15 | This module allows you to ensure no events will be dropped just because they were not classified.
16 | Of course the precision is based on the previously observed data but it is still better than dropping the events completely.
17 |
18 | Applicable for example in the following cases:
19 |
20 | - Application usually sends its event identifier within HTTP headers.
21 | In cases where communication is interrupted in a way that this header is not sent
22 | (e.g. HTTP 5xx or 499 status codes), we have no way how to identify (and thus classify) the event.
23 |
24 |
25 | `moduleConfig`
26 | ```yaml
27 | # Time interval from which calculate the distribution used for classification.
28 | historyWindowSize: "30m"
29 | # How often the weights calculated over the historyWindowSize will be updated.
30 | historyWeightUpdateInterval: "1m"
31 | # Default weights to be used in case that there were no events recently to deduce the real weights.
32 | defaultWeights:
33 | -
34 | ```
35 |
36 | `classificationWeight`
37 | ```yaml
38 | # Dimensionless number to be compared with other default weights.
39 | weight:
40 | # Classification to be guessed with the specified weight.
41 | classification:
42 | sloDomain:
43 | sloClass:
44 | ```
45 |
--------------------------------------------------------------------------------
/docs/modules/tailer.md:
--------------------------------------------------------------------------------
1 | # Tailer
2 |
3 | | | |
4 | |----------------|-------------|
5 | | `moduleName` | `tailer` |
6 | | Module type | `producer` |
7 | | Output event | `raw` |
8 |
9 | This module is able to tail file and parse each line using regular expression with named groups.
10 | Those group names are used as metadata keys of the produced event and the values are the matching strings.
11 |
12 | It persists the last read position to file, so it can continue if restarted.
13 |
14 | It can be used for example to tail proxy log and create events from it
15 | so you can calculate SLO for your HTTP servers etc.
16 |
17 | `moduleConfig`
18 | ```yaml
19 | # Path to file to be processed.
20 | tailedFile: "/logs/access_log"
21 | # If tailed file should be followed for new lines once all current lines are processed.
22 | follow: true
23 | # If tailed file should be reopened.
24 | reopen: true
25 | # Path to file where to persist position of tailing.
26 | positionFile: ""
27 | # How often current position should be persisted to the position file.
28 | positionPersistenceInterval: "2s"
29 | # Defines RE which is used to parse the log line.
30 | # Currently known named groups which are used to extract information for generated Events are:
31 | # sloDomain - part of SLO classification for the given event.
32 | # sloApp - part of SLO classification for the given event.
33 | # sloClass - part of SLO classification for the given event.
34 | # All other named groups will be added to the request event as event.Metadata.
35 | loglineParseRegexp: '^(?P[A-Fa-f0-9.:]{4,50}) \S+ \S+ \[(?P.*?)\] "(?P.*?)" (?P\d+) \d+ "(?P.*?)" uag="(?P[^"]+)" "[^"]+" ua="[^"]+" rt="(?P\d+(\.\d+)??)".+ignore-slo="(?P[^"]*)" slo-domain="(?P[^"]*)" slo-app="(?P[^"]*)" slo-class="(?P[^"]*)" slo-endpoint="(?P[^"]*)" slo-result="(?P[^"]*)"'
36 | # emptyGroupRE defines RE used to decide whether some of the RE match groups specified in loglineParseRegexp is empty and thus its assigned variable should be kept uninitialized.
37 | # Value, that will be treated as empty value.
38 | emptyGroupRE: '^-$'
38 | ```
39 |
40 |
--------------------------------------------------------------------------------
/docs/operating.md:
--------------------------------------------------------------------------------
1 | # Operating
2 |
3 | ## Debugging
4 | If you need to dynamically change the log level of the application, you can use the `/logging` HTTP endpoint.
5 | To set the log level use the `POST` method with URL parameter `level` of value `error`, `warning`, `info` or `debug`.
6 |
7 | Example using `cURL`
8 | ```bash
9 | # Use GET to get current log level.
10 | $ curl -s http://0.0.0.0:8080/logging
11 | current logging level is: debug
12 |
13 | # Use POST to set the log level.
14 | $ curl -XPOST -s http://0.0.0.0:8080/logging?level=info
15 | logging level set to: info
16 | ```
17 |
18 | #### Profiling
19 | In case of issues with leaking resources for example, slo-exporter supports the
20 | Go profiling using pprof on `/debug/pprof/` web interface path. For usage see the official [docs](https://golang.org/pkg/net/http/pprof/).
21 |
22 |
23 | ## Frequently asked questions
24 |
25 | ### How to add new normalization replacement rule?
26 | Event normalization can be done using the `relabel` module, see [its documentation](modules/relabel.md).
27 |
28 | ### How to deal with malformed lines?
29 | Before !87: if you are seeing too many malformed lines, you should inspect the [tailer package](pkg/tailer/tailer.go) and look for the variable `lineParseRegexp`.
30 | After !87, slo-exporter main config supports to specify custom regular expression in field `.module.tailer.loglineParseRegexp`.
31 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 | Here you can find example configurations of slo-exporter for distinct use-cases.
3 | Each example should have its own README.md with description what the example does and how the slo-exporter is actually configured.
4 |
--------------------------------------------------------------------------------
/examples/all_in_one/README.md:
--------------------------------------------------------------------------------
1 | # All-in-one example
2 |
3 | ### Overview
4 | Use the provided [docker-compose](./docker-compose.yaml) to start the complete setup with
5 | Prometheus instance loaded with [example SLO recording rules and alerts](/prometheus),
6 | and Grafana instance with loaded [SLO dashboards](/grafana_dashboards).
7 |
8 | Description of the whole setup follows:
9 | - **Nginx configured with the following paths:**
10 | - `nginx:8080/` -> `HTTP 200`, all ok
11 | - `nginx:8080/err` -> `HTTP 500`, availability violation
12 | - `nginx:8080/drop`-> `limit 1r/m`, latency violation
13 | - **Slo-exporter configured to tail the Nginx's logs**
14 | - **Prometheus**
15 | - configured to scrape the slo-exporter's metrics
16 | - loaded with necessary recording-rules for SLO computation
17 | - **Grafana**
18 | - with Prometheus preconfigured as a datasource
19 | - loaded with [SLO dashboards](/grafana_dashboards/)
20 | - **Slo-event-generator**
21 | - an infinite loop accessing the Nginx instance to generate slo-events.
22 |
23 | ### How to run it
24 | ```bash
25 | docker-compose pull && docker-compose up
26 | ```
27 |
28 | To access Grafana and Prometheus:
29 | ```
30 | # http://localhost:9090 Prometheus
31 | # http://localhost:3000 Grafana
32 | # User: admin
33 | # Password: admin
34 | ```
35 |
36 | **Please note that it may take up to 5 minutes until Grafana dashboards will show any data. This is caused by evaluation interval of the included Prometheus recording rules.**
37 |
--------------------------------------------------------------------------------
/examples/all_in_one/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | services:
4 | nginx:
5 | image: nginx
6 | volumes:
7 | - "./nginx/conf/nginx.conf:/etc/nginx/nginx.conf:ro"
8 | - "./nginx/static:/nginx/static:ro"
9 | - "nginx-logs:/nginx/logs/"
10 |
11 | slo-exporter:
12 | image: seznam/slo-exporter:latest
13 | depends_on:
14 | - nginx
15 | ports:
16 | - 8001:8001
17 | working_dir: /slo-exporter
18 | command:
19 | - "--config-file=/slo-exporter/conf/slo_exporter.yaml"
20 | volumes:
21 | - ./slo-exporter/conf:/slo-exporter/conf/
22 | - nginx-logs:/logs/
23 |
24 | slo-event-generator:
25 | image: nginx
26 | entrypoint: /bin/bash
27 | command: -c 'while true; do
28 | for i in `seq 20`; do curl -s http://nginx:8080/ >/dev/null 2>&1 ; done;
29 | for i in `seq $$(($$RANDOM % 3))`; do curl -s http://nginx:8080/err >/dev/null 2>&1 ; done;
30 | curl -m 1 -s http://nginx:8080/drop >/dev/null 2>&1 >/dev/null || true;
31 | echo -n ".";
32 | sleep 5;
33 | done'
34 |
35 | prometheus:
36 | image: prom/prometheus:latest
37 | depends_on:
38 | - slo-exporter
39 | ports:
40 | - 9090:9090
41 | environment:
42 | PROMETHEUS_CONFIG: |
43 | {
44 | "scrape_configs":[{
45 | "job_name": "slo-exporter",
46 | "scrape_interval": "2s",
47 | "static_configs":[
48 | {"targets":["slo-exporter:8001"]},
49 | ],
50 | }],
51 | "rule_files": ["/prometheus/recording_rules/*yaml", "/prometheus/alerts/*yaml"]
52 | }
53 | entrypoint: ["sh"]
54 | command:
55 | - "-c"
56 | - 'echo $$PROMETHEUS_CONFIG > /etc/prometheus/prometheus.yml; prometheus --config.file=/etc/prometheus/prometheus.yml'
57 | volumes:
58 | - ./prometheus/recording_rules:/prometheus/recording_rules
59 | - ./prometheus/alerts:/prometheus/alerts
60 |
61 | grafana:
62 | image: grafana/grafana:8.3.3
63 | depends_on:
64 | - prometheus
65 | ports:
66 | - 3000:3000
67 | volumes:
68 | - ./grafana/provisioning/:/etc/grafana/provisioning/
69 |
70 | volumes:
71 | nginx-logs:
72 |
--------------------------------------------------------------------------------
/examples/all_in_one/example-domain-slo-conf.yaml:
--------------------------------------------------------------------------------
1 | ../../tools/slo-rules-generator/all-in-one-example-domain.yaml
--------------------------------------------------------------------------------
/examples/all_in_one/grafana/provisioning/dashboards/dashboard.yml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | providers:
4 | - name: 'Prometheus'
5 | orgId: 1
6 | folder: 'SLO'
7 | type: file
8 | disableDeletion: false
9 | editable: true
10 | options:
11 | path: /etc/grafana/provisioning/dashboards
12 |
--------------------------------------------------------------------------------
/examples/all_in_one/grafana/provisioning/datasources/datasource.yml:
--------------------------------------------------------------------------------
1 | # config file version
2 | apiVersion: 1
3 |
4 | # list of datasources that should be deleted from the database
5 | deleteDatasources:
6 | - name: Prometheus
7 | orgId: 1
8 |
9 | # list of datasources to insert/update depending
10 | # what's available in the database
11 | datasources:
12 | # name of the datasource. Required
13 | - name: Prometheus
14 | # datasource type. Required
15 | type: prometheus
16 | # access mode. proxy or direct (Server or Browser in the UI). Required
17 | access: proxy
18 | # org id. will default to orgId 1 if not specified
19 | orgId: 1
20 | # custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically
21 | uid: my_unique_uid
22 | # url
23 | url: http://prometheus:9090
24 |
--------------------------------------------------------------------------------
/examples/all_in_one/nginx/conf/nginx.conf:
--------------------------------------------------------------------------------
1 | events {
2 | worker_connections 1024;
3 | }
4 |
5 | http {
6 | server_tokens off;
7 | include mime.types;
8 | charset utf-8;
9 |
10 | log_format upstream_time '$remote_addr - $remote_user [$time_local] '
11 | '"$request" $status $body_bytes_sent '
12 | '"$http_referer" "$http_user_agent" '
13 | 'rt=$request_time uct="$upstream_connect_time" uht="$upstream_header_time" urt="$upstream_response_time"';
14 |
15 | access_log /nginx/logs/access_log upstream_time;
16 |
17 | limit_req_zone $binary_remote_addr zone=one:10m rate=1r/m;
18 |
19 | server {
20 | server_name localhost;
21 | listen 0.0.0.0:8080;
22 |
23 | set $content_class static;
24 | location / {
25 | return 200;
26 | }
27 |
28 | location /err {
29 | return 500;
30 | }
31 |
32 | location /drop {
33 | # delay incoming requests so that the client will timeout
34 | limit_req zone=one burst=5;
35 | }
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/examples/all_in_one/prometheus/alerts:
--------------------------------------------------------------------------------
1 | ../../../prometheus/alerts/
--------------------------------------------------------------------------------
/examples/all_in_one/prometheus/recording_rules:
--------------------------------------------------------------------------------
1 | ../../../prometheus/recording_rules/
--------------------------------------------------------------------------------
/examples/all_in_one/slo-exporter/conf/classification.csv:
--------------------------------------------------------------------------------
1 | example-domain,example-app,critical,"^(GET|POST|HEAD|PUT|DELETE):.*"
2 |
--------------------------------------------------------------------------------
/examples/all_in_one/slo-exporter/conf/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 | webServerListenAddress: "0.0.0.0:8001"
2 | maximumGracefulShutdownDuration: "10s"
3 | afterPipelineShutdownDelay: "1s"
4 |
5 | pipeline: ["tailer", "relabel", "eventKeyGenerator", "dynamicClassifier", "sloEventProducer", "prometheusExporter"]
6 |
7 | modules:
8 |
9 | tailer:
10 | tailedFile: "/logs/access_log"
11 | follow: true
12 | reopen: true
13 | positionFile: ""
14 | positionPersistenceInterval: "2s"
15 | loglineParseRegexp: '^(?P<ip>[A-Fa-f0-9.:]{4,50}) - \S+ \[(?P<time>.*?)\] "\s*(?P<httpMethod>GET|POST|HEAD|UPDATE|DELETE|PUT|CONNECT|OPTIONS|TRACE|PATCH)\s+(?P<httpPath>[^\s]+)\s+(?P<protocolVersion>[^\s]+)?\s*" (?P<statusCode>\d+) \d+ "(?P<referer>.*?)" "(?P<userAgent>[^"]+)" rt=(?P<requestDuration>\d+(\.\d+)??) uct="[^"]+" uht="[^"]+" urt="[^"]+"'
16 | emptyGroupRE: '^-$'
17 |
18 | relabel:
19 | eventRelabelConfigs:
20 | # Drop events with unwanted status codes
21 | - source_labels: ["statusCode"]
22 | regex: "30[12]|40[045]|411|408|499"
23 | action: drop
24 |
25 | eventKeyGenerator:
26 | filedSeparator: ":"
27 | metadataKeys:
28 | - httpMethod
29 | - httpPath
30 |
31 | dynamicClassifier:
32 | regexpMatchesCsvFiles:
33 | - "conf/classification.csv"
34 |
35 | sloEventProducer:
36 | rulesFiles:
37 | - "conf/slo_rules.yaml"
38 |
39 | prometheusExporter:
40 | metricName: "slo_events_total"
41 | maximumUniqueEventKeys: 1000
42 | ExceededKeyLimitPlaceholder: "cardinalityLimitExceeded"
43 | labelNames:
44 | result: "result"
45 | sloDomain: "slo_domain"
46 | sloClass: "slo_class"
47 | sloApp: "slo_app"
48 | eventKey: "event_key"
49 |
--------------------------------------------------------------------------------
/examples/all_in_one/slo-exporter/conf/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 | - slo_matcher:
3 | domain: example-domain
4 | failure_conditions:
5 | - operator: numberIsEqualOrHigherThan
6 | key: statusCode
7 | value: 500
8 | additional_metadata:
9 | slo_type: availability
10 | slo_version: 1
11 | namespace: test
12 | cluster: test-cluster
13 |
14 | - slo_matcher:
15 | domain: example-domain
16 | class: critical
17 | failure_conditions:
18 | - operator: numberIsHigherThan
19 | key: requestDuration
20 | value: 0.8
21 | additional_metadata:
22 | slo_version: 1
23 | slo_type: latency90
24 | percentile: 90
25 | le: 0.8
26 | namespace: test
27 | cluster: test-cluster
28 |
29 | - slo_matcher:
30 | domain: example-domain
31 | class: critical
32 | failure_conditions:
33 | - operator: numberIsHigherThan
34 | key: requestDuration
35 | value: 2
36 | additional_metadata:
37 | slo_version: 1
38 | slo_type: latency99
39 | percentile: 99
40 | le: 2
41 | namespace: test
42 | cluster: test-cluster
43 |
--------------------------------------------------------------------------------
/examples/envoy_proxy/README.md:
--------------------------------------------------------------------------------
1 | # Envoy proxy SLO example
2 |
3 | This example shows a simple configuration of slo-exporter using
4 | [`envoy access-log-server module`](/docs/modules/envoy_access_log_server.md).
5 |
6 | #### How to run it
7 | In root of the repo
8 | ```bash
9 | make docker
10 | cd examples/envoy_proxy
11 | docker-compose up -d
12 | ```
13 | Once started, see http://localhost:8001/metrics.
14 |
15 | ## How SLO is computed
16 | - [envoyAccessLogServer module](/docs/modules/envoy_access_log_server.md) is used to receive envoy's logs via grpc.
17 | - [relabel module](/docs/modules/relabel.md) drops the unwanted events (e.g. based on its HTTP status code, userAgent,...).
18 | - [metadataClassifier module](/docs/modules/metadata_classifier.md) classifies the generated event based on HTTP headers sent by the client
19 |
20 | ## Observed SLO types
21 | Refer to [slo_rules.yaml](./slo-exporter/slo_rules.yaml) for the exact configuration of how SLO events are generated based on input logs/events.
22 |
23 | #### `availability`
24 | For every log line which results in a classified event in domain `example-domain`, an SLO event is generated. Its result is determined based on the `responseCode` metadata key - with all events with `responseCode >= 500` being marked as failed.
25 |
26 | #### `latency90`, `latency99`
27 | For every log line which results in a classified event in domain `example-domain` and slo_class `critical`, an SLO event is generated. Its result is determined based on the `timeToLastDownstreamTxByte` metadata key.
28 |
--------------------------------------------------------------------------------
/examples/envoy_proxy/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | services:
4 | envoy:
5 | network_mode: "host"
6 | image: envoyproxy/envoy:v1.16-latest
7 | volumes:
8 | - "./envoy/envoy.yaml:/conf/envoy.yaml:ro"
9 | command:
10 | - "-c"
11 | - "/conf/envoy.yaml"
12 |
13 | slo-exporter:
14 | network_mode: "host"
15 | image: slo_exporter:latest
16 | working_dir: /slo-exporter
17 | command:
18 | - "--config-file=/slo-exporter/slo_exporter.yaml"
19 | - "--log-level=debug"
20 | volumes:
21 | - ./slo-exporter/:/slo-exporter/
22 |
23 | slo-event-generator:
24 | network_mode: "host"
25 | image: curlimages/curl
26 | entrypoint: /bin/sh
27 | command: |
28 | -c 'while true; do
29 | for i in `seq 20`; do curl -s -H "slo-domain: example-domain" -H "slo-class: critical" -H "slo-app: homepage-static" http://localhost:8080/ >/dev/null 2>&1 ; done;
30 | echo -n ".";
31 | sleep 5;
32 | done'
--------------------------------------------------------------------------------
/examples/envoy_proxy/envoy/envoy.yaml:
--------------------------------------------------------------------------------
1 | static_resources:
2 |
3 | listeners:
4 | - name: listener_0
5 | address:
6 | socket_address:
7 | address: 0.0.0.0
8 | port_value: 8080
9 | filter_chains:
10 | - filters:
11 | - name: envoy.filters.network.http_connection_manager
12 | typed_config:
13 | "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
14 | stat_prefix: ingress_http
15 | access_log:
16 | - name: envoy.access_loggers.file
17 | typed_config:
18 | "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
19 | path: /dev/stdout
20 | - name: envoy.access_loggers.http_grpc
21 | typed_config:
22 | "@type": type.googleapis.com/envoy.extensions.access_loggers.grpc.v3.HttpGrpcAccessLogConfig
23 | common_config:
24 | grpc_service:
25 | envoy_grpc:
26 | cluster_name: service_accesslog
27 | buffer_size_bytes:
28 | value: 0
29 | log_name: accesslogv3
30 | transport_api_version: V3
31 | additional_request_headers_to_log: ['slo-class', 'slo-domain', 'slo-app']
32 | http_filters:
33 | - name: envoy.filters.http.router
34 | route_config:
35 | name: local_route
36 | virtual_hosts:
37 | - name: local_service
38 | domains: ["*"]
39 | routes:
40 | - match:
41 | prefix: "/"
42 | route:
43 | cluster: service_neverssl_com
44 | host_rewrite_literal: neverssl.com
45 |
46 | clusters:
47 | - name: service_accesslog
48 | connect_timeout: 6s
49 | type: LOGICAL_DNS
50 | load_assignment:
51 | cluster_name: service_accesslog
52 | endpoints:
53 | - lb_endpoints:
54 | - endpoint:
55 | address:
56 | socket_address:
57 | address: localhost
58 | port_value: 18090
59 | http2_protocol_options: {}
60 |
61 | - name: service_neverssl_com
62 | connect_timeout: 30s
63 | type: LOGICAL_DNS
64 | dns_lookup_family: V4_ONLY
65 | load_assignment:
66 | cluster_name: service_neverssl_com
67 | endpoints:
68 | - lb_endpoints:
69 | - endpoint:
70 | address:
71 | socket_address:
72 | address: neverssl.com
73 | port_value: 80
74 |
--------------------------------------------------------------------------------
/examples/envoy_proxy/slo-exporter/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 | webServerListenAddress: "0.0.0.0:8001"
2 | maximumGracefulShutdownDuration: "10s"
3 | afterPipelineShutdownDelay: "1s"
4 |
5 | pipeline: ["envoyAccessLogServer", "relabel", "eventKeyGenerator", "metadataClassifier", "sloEventProducer", "prometheusExporter"]
6 |
7 | modules:
8 |
9 | envoyAccessLogServer: {}
10 |
11 | relabel:
12 | eventRelabelConfigs:
13 | # Drop events with unwanted status codes
14 | - source_labels: ["responseCode"]
15 | regex: "30[12]|40[045]|411|408|499"
16 | action: drop
17 |
18 | eventKeyGenerator:
19 | filedSeparator: ":"
20 | metadataKeys:
21 | - requestMethod
22 | - path
23 |
24 | metadataClassifier:
25 | sloDomainMetadataKey: http_slo-domain
26 | sloClassMetadataKey: http_slo-class
27 | sloAppMetadataKey: http_slo-app
28 |
29 | sloEventProducer:
30 | rulesFiles:
31 | - "slo_rules.yaml"
32 |
33 | prometheusExporter:
34 | metricName: "slo_events_total"
35 | labelNames:
36 | result: "result"
37 | sloDomain: "slo_domain"
38 | sloClass: "slo_class"
39 | sloApp: "slo_app"
40 | eventKey: "event_key"
41 |
--------------------------------------------------------------------------------
/examples/envoy_proxy/slo-exporter/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 | - slo_matcher:
3 | domain: example-domain
4 | failure_conditions:
5 | - operator: numberIsEqualOrHigherThan
6 | key: responseCode
7 | value: 500
8 | additional_metadata:
9 | slo_type: availability
10 | slo_version: 1
11 | namespace: test
12 |
13 | - slo_matcher:
14 | domain: example-domain
15 | class: critical
16 | failure_conditions:
17 | - operator: durationIsHigherThan
18 | key: timeToLastDownstreamTxByte
19 | value: 10ms
20 | additional_metadata:
21 | slo_version: 1
22 | slo_type: latency90
23 | percentile: 90
24 | le: 0.01
25 | namespace: test
26 |
27 | - slo_matcher:
28 | domain: example-domain
29 | class: critical
30 | failure_conditions:
31 | - operator: durationIsHigherThan
32 | key: timeToLastDownstreamTxByte
33 | value: 50ms
34 | additional_metadata:
35 | slo_version: 1
36 | slo_type: latency99
37 | percentile: 99
38 | le: 0.05
39 | namespace: test
40 |
--------------------------------------------------------------------------------
/examples/kafka/README.md:
--------------------------------------------------------------------------------
1 | # Kafka ingester SLO example
2 |
3 | This example shows a simple configuration of slo-exporter using
4 | [`kafka_ingester`](/docs/modules/kafka_ingester.md)
5 | as a source of data in order to compute SLO of a server which publishes events through Kafka.
6 |
7 | #### How to run it
8 | In root of the repo
9 | ```bash
10 | make docker
11 | cd examples/kafka
12 | docker compose up -d
13 | ```
14 | Once started, see http://localhost:8080/metrics.
15 |
16 | ## How SLO is computed
17 | Kafkacat is used to publish events to Kafka on behalf of an imaginary server. Each event contains its SLO classification together with metadata.
18 |
19 | ## Observed SLO types
20 | #### `availability`
21 | All events whose "result" metadata key equals "success" are considered successful.
22 |
23 | #### `quality`
24 | All events whose quality-degradation-tracking metadata key(s) all equal 0 are considered successful.
25 |
--------------------------------------------------------------------------------
/examples/kafka/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | slo-exporter:
4 | image: slo_exporter
5 | depends_on:
6 | - topic-initialization
7 | ports:
8 | - 8080:8080
9 | working_dir: /slo-exporter
10 | command:
11 | - "--config-file=/slo-exporter/slo_exporter.yaml"
12 | volumes:
13 | - ./slo-exporter/:/slo-exporter/
14 |
15 | topic-initialization:
16 | image: confluentinc/cp-kafka:6.0.2
17 | command: kafka-topics --create --topic slo-exporter --partitions 4 --replication-factor 2 --if-not-exists --bootstrap-server kafka-1:9092
18 | depends_on:
19 | - kafka-1
20 | - kafka-2
21 | - kafka-3
22 |
23 | kafkacat:
24 | image: confluentinc/cp-kafkacat
25 | command: |
26 | bash -c "
27 | while true;
28 | do
29 | echo '{\"quantity\": 1, \"slo_classification\": {\"app\": \"fooApp\", \"domain\": \"testDomain\", \"class\": \"critical\"}, \"metadata\": {\"name\": \"foo\", \"degradation_slave_response\": \"1\", \"result\": \"success\"}}' | kafkacat -P -b kafka-1:9092,kafka-2:9092,kafka-3:9092 -t slo-exporter -p 0
30 | sleep 1
31 | done"
32 | depends_on:
33 | - topic-initialization
34 |
35 | zookeeper-1:
36 | image: confluentinc/cp-zookeeper:6.0.2
37 | environment:
38 | ZOOKEEPER_SERVER_ID: 1
39 | ZOOKEEPER_CLIENT_PORT: 22181
40 | ZOOKEEPER_TICK_TIME: 2000
41 | ZOOKEEPER_INIT_LIMIT: 5
42 | ZOOKEEPER_SYNC_LIMIT: 2
43 | ZOOKEEPER_SERVERS: zookeeper-1:22888:23888;zookeeper-2:32888:33888;zookeeper-3:42888:43888
44 |
45 | zookeeper-2:
46 | image: confluentinc/cp-zookeeper:6.0.2
47 | environment:
48 | ZOOKEEPER_SERVER_ID: 2
49 | ZOOKEEPER_CLIENT_PORT: 32181
50 | ZOOKEEPER_TICK_TIME: 2000
51 | ZOOKEEPER_INIT_LIMIT: 5
52 | ZOOKEEPER_SYNC_LIMIT: 2
53 | ZOOKEEPER_SERVERS: zookeeper-1:22888:23888;zookeeper-2:32888:33888;zookeeper-3:42888:43888
54 |
55 | zookeeper-3:
56 | image: confluentinc/cp-zookeeper:6.0.2
57 | environment:
58 | ZOOKEEPER_SERVER_ID: 3
59 | ZOOKEEPER_CLIENT_PORT: 42181
60 | ZOOKEEPER_TICK_TIME: 2000
61 | ZOOKEEPER_INIT_LIMIT: 5
62 | ZOOKEEPER_SYNC_LIMIT: 2
63 | ZOOKEEPER_SERVERS: zookeeper-1:22888:23888;zookeeper-2:32888:33888;zookeeper-3:42888:43888
64 |
65 | kafka-1:
66 | image: confluentinc/cp-kafka:6.0.2
67 | depends_on:
68 | - zookeeper-1
69 | - zookeeper-2
70 | - zookeeper-3
71 | environment:
72 | KAFKA_BROKER_ID: 1
73 | KAFKA_ZOOKEEPER_CONNECT: zookeeper-1:22181,zookeeper-2:22181,zookeeper-3:22181
74 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka-1:9092
75 |
76 | kafka-2:
77 | image: confluentinc/cp-kafka:6.0.2
78 | depends_on:
79 | - zookeeper-1
80 | - zookeeper-2
81 | - zookeeper-3
82 | environment:
83 | KAFKA_BROKER_ID: 2
84 | KAFKA_ZOOKEEPER_CONNECT: zookeeper-1:22181,zookeeper-2:22181,zookeeper-3:22181
85 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka-2:9092
86 |
87 | kafka-3:
88 | image: confluentinc/cp-kafka:6.0.2
89 | depends_on:
90 | - zookeeper-1
91 | - zookeeper-2
92 | - zookeeper-3
93 | environment:
94 | KAFKA_BROKER_ID: 3
95 | KAFKA_ZOOKEEPER_CONNECT: zookeeper-1:22181,zookeeper-2:22181,zookeeper-3:22181
96 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka-3:9092
97 |
--------------------------------------------------------------------------------
/examples/kafka/slo-exporter/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 | webServerListenAddress: "0.0.0.0:8080"
2 |
3 | pipeline: ["kafkaIngester", "eventKeyGenerator", "sloEventProducer", "prometheusExporter"]
4 |
5 | modules:
6 | kafkaIngester:
7 | brokers:
8 | - "kafka-1:9092"
9 | - "kafka-2:9092"
10 | - "kafka-3:9092"
11 | topic: slo-exporter
12 | groupId: slo-exporter
13 | logKafkaEvents: true
14 |
15 | eventKeyGenerator:
16 | metadataKeys:
17 | - "name"
18 |
19 | sloEventProducer:
20 | rulesFiles:
21 | - "slo_rules.yaml"
22 |
23 | prometheusExporter: {}
24 |
--------------------------------------------------------------------------------
/examples/kafka/slo-exporter/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 | - failure_conditions:
3 | - key: result
4 | operator: isNotEqualTo
5 | value: "success"
6 | additional_metadata:
7 | slo_type: availability
8 | slo_version: 1
9 |
10 | # Mark event as failed for slo_type: quality if any of the observed quality degradations occurred
11 | - failure_conditions:
12 | - key: degradation_slave_response
13 | operator: numberIsHigherThan
14 | value: 0
15 | additional_metadata:
16 | slo_type: quality
17 | slo_version: 1
18 |
--------------------------------------------------------------------------------
/examples/nginx_proxy/README.md:
--------------------------------------------------------------------------------
1 | # Nginx proxy SLO example
2 |
3 | This example shows a simple configuration of slo-exporter using
4 | [`tailer module`](/docs/modules/tailer.md) to parse
5 | Nginx proxy's logs as a source of data in order to compute SLO.
6 |
7 | #### How to run it
8 | In root of the repo
9 | ```bash
10 | make build
11 | cd examples/nginx_proxy
12 | ../../slo_exporter --config-file slo_exporter.yaml
13 | ```
14 | Once started, see http://localhost:8080/metrics.
15 |
16 | ## How SLO is computed
17 | - [tailer module](/docs/modules/tailer.md) is used to parse the logs. Note the `modules.tailer.loglineParseRegexp` configuration which needs to match the used Nginx log format.
18 | - [relabel module](/docs/modules/relabel.md) drops the unwanted events (e.g. based on its HTTP status code, userAgent,...), normalize URI and eventually set a new event's metadata key (see `operationName`). Not all of this may be needed in your use case, but we include it to present an example use of this module.
19 | - [dynamicClassifier module](/docs/modules/dynamic_classifier.md) classifies the generated event based on the provided [classification.csv](./classification.csv)
20 |
21 | ## Observed SLO types
22 | Refer to [slo_rules.yaml](./slo_rules.yaml) for the exact configuration of how SLO events are generated based on input logs/events.
23 |
24 | #### `availability`
25 | For every log line which results in a classified event in domain `test-domain`, an SLO event is generated. Its result is determined based on the statusCode metadata key - with all events with `statusCode >= 500` being marked as failed.
26 |
27 |
28 | #### `latency90`
29 | For every log line which results in a classified event in domain `test-domain` and slo_class `critical`, an SLO event is generated. Its result is determined based on the requestDuration metadata key - with all events which took more than `0.8s` to process being marked as failed. This SLO type represents the 90th latency percentile - in other words, we expect that more than 90% of events meet this condition.
--------------------------------------------------------------------------------
/examples/nginx_proxy/classification.csv:
--------------------------------------------------------------------------------
1 | test-domain,test-app,high_fast,"^(GET|POST|HEAD|PUT|DELETE):/api/v1/.*$"
2 | test-domain,test-app,critical,"^(GET|POST|HEAD|PUT|DELETE):/api/v2/.*$"
3 | test-domain,test-app,critical,"^(GET|POST|HEAD|PUT|DELETE):/img2/.*$"
4 |
--------------------------------------------------------------------------------
/examples/nginx_proxy/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 | webServerListenAddress: "0.0.0.0:8080"
2 | maximumGracefulShutdownDuration: "10s"
3 | afterPipelineShutdownDelay: "1s"
4 |
5 | pipeline: ["tailer", "relabel", "eventKeyGenerator", "dynamicClassifier", "statisticalClassifier", "sloEventProducer", "prometheusExporter"]
6 |
7 | modules:
8 |
9 | tailer:
10 | tailedFile: "test.log"
11 | follow: true
12 | reopen: true
13 | positionFile: ""
14 | positionPersistenceInterval: "2s"
15 | loglineParseRegexp: '^(?P<ip>[A-Fa-f0-9.:]{4,50}) \S+ \S+ \[(?P<time>.*?)\] "\s*(?P<httpMethod>GET|POST|HEAD|UPDATE|DELETE|PUT|CONNECT|OPTIONS|TRACE|PATCH)\s+(?P<httpPath>[^\s]+)\s+(?P<protocolVersion>[^\s]+)?\s*" (?P<statusCode>\d+) \d+ "(?P<referer>.*?)" uag="(?P<userAgent>[^"]+)" "[^"]+" ua="[^"]+" rt="(?P<requestDuration>\d+(\.\d+)??)".+ignore-slo="(?P<ignoreSlo>[^"]*)" slo-domain="(?P<sloDomain>[^"]*)" slo-app="(?P<sloApp>[^"]*)" slo-class="(?P<sloClass>[^"]*)" slo-endpoint="(?P<sloEndpoint>[^"]*)" slo-result="(?P<sloResult>[^"]*)"'
16 | emptyGroupRE: '^-$'
17 |
18 | relabel:
19 | eventRelabelConfigs:
20 | # Drop events with unwanted status codes
21 | - source_labels: ["statusCode"]
22 | regex: "30[12]|40[045]|411"
23 | action: drop
24 | # Drop events with unwanted user agent
25 | - source_labels: ["userAgent"]
26 | regex: "(?i)(?:sentry|blackbox-exporter|kube-probe)"
27 | action: drop
28 |
29 | # Parse out GET parameter from URI as separate label
30 | - source_labels: ["httpPath"]
31 | regex: '.*operationName=(.*)(?:&.*)$'
32 | action: replace
33 | target_label: "operationName"
34 |
35 | # Normalize numbers in URI
36 | - source_labels: ["httpPath"]
37 | regex: '(.*)/\d+(/(?:.*))?'
38 | action: replace
39 | target_label: "httpPath"
40 | replacement: "$1/0$2"
41 |
42 |
43 | eventKeyGenerator:
44 | filedSeparator: ":"
45 | overrideExistingEventKey: false
46 | metadataKeys:
47 | - httpMethod
48 | - httpPath
49 | - operationName
50 |
51 | dynamicClassifier:
52 | regexpMatchesCsvFiles:
53 | - "classification.csv"
54 | unclassifiedEventMetadataKeys:
55 | - "userAgent"
56 |
57 | statisticalClassifier:
58 | historyWindowSize: "30m"
59 | historyWeightUpdateInterval: "1m"
60 | defaultWeights:
61 | - weight: 1
62 | classification:
63 | sloDomain: "test-domain"
64 | sloClass: "test1"
65 |
66 | sloEventProducer:
67 | rulesFiles:
68 | - "slo_rules.yaml"
69 |
70 | prometheusExporter:
71 | metricName: "slo_events_total"
72 | maximumUniqueEventKeys: 1000
73 | ExceededKeyLimitPlaceholder: "cardinalityLimitExceeded"
74 | labelNames:
75 | result: "result"
76 | sloDomain: "slo_domain"
77 | sloClass: "slo_class"
78 | sloApp: "slo_app"
79 | eventKey: "event_key"
80 |
--------------------------------------------------------------------------------
/examples/nginx_proxy/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 | - slo_matcher:
3 | domain: test-domain
4 | failure_conditions:
5 | - operator: numberIsEqualOrHigherThan
6 | key: statusCode
7 | value: 500
8 | - operator: isMatchingRegexp
9 | key: sloResult
10 | value: "[fF]ail(ure)?"
11 | additional_metadata:
12 | slo_type: availability
13 | slo_version: 6
14 |
15 | - slo_matcher:
16 | domain: test-domain
17 | class: critical
18 | failure_conditions:
19 | - operator: numberIsHigherThan
20 | key: requestDuration
21 | value: 0.8
22 | additional_metadata:
23 | slo_version: 6
24 | slo_type: latency90
25 | percentile: 90
26 | le: 0.8
27 |
--------------------------------------------------------------------------------
/examples/prometheus/README.md:
--------------------------------------------------------------------------------
1 | # Prometheus SLO example
2 |
3 | This example shows a simple configuration of slo-exporter using
4 | [`prometheus_ingester`](/docs/modules/prometheus_ingester.md)
5 | as a source of data in order to compute SLO of Prometheus itself.
6 |
7 | #### How to run it
8 | In root of the repo
9 | ```bash
10 | make build
11 | cd examples/prometheus
12 | ../../slo_exporter --config-file slo_exporter.yaml
13 | ```
14 | Once started, see http://localhost:8080/metrics.
15 |
16 | ## How SLO is computed
17 |
18 | Prometheus has few basic functionalities which should be covered by its SLOs.
19 |
20 | ## Observed SLO types
21 | #### `api_availability`
22 | Uses Prometheus counter `prometheus_http_requests_total` which has label `handler` and `code`.
23 | An event is generated for every observed increase of this metric with the corresponding metadata using
24 | [`counter_increase` query type of `prometheus_ingester` module ](/docs/modules/prometheus_ingester.md#type-counter_increase).
25 | Label `handler` holds information which endpoint has been called. That is used as an event key to classify its importance (SLO class).
26 | Label `code` contains the resulting status code of the request. That is used to decide if the event was a success (<500) or a failure (>=500).
27 |
28 | #### `api_latency`
29 | Uses Prometheus histogram `prometheus_http_request_duration_seconds_bucket` which tracks response latency distribution of all requests.
30 | It generates event for every request with the corresponding metadata and additional values from the `le` labels about
31 | upper and lower boundary of the bucket the event falls into using
32 | [the `prometheus_ingester` modules `histogram_increase` query type](/docs/modules/prometheus_ingester.md#type-histogram_increase).
33 | Then the lower boundary, holding the minimum duration the request took, is compared with the latency threshold to decide if the request was a success or a failure.
34 | The latency has different thresholds for distinct SLO classes, so the rules are separate for each of them.
35 |
--------------------------------------------------------------------------------
/examples/prometheus/exact_events_classification.csv:
--------------------------------------------------------------------------------
1 | # Critical endpoints which should be fast and highly available.
2 | ui,prometheus,critical,"/"
3 | ui,prometheus,critical,"/graph"
4 | ui,prometheus,critical,"/metrics"
5 |
6 | # Less important endpoints with low latency expected.
7 | ui,prometheus,high_fast,"/flags"
8 | ui,prometheus,high_fast,"/static/*filepath"
9 | ui,prometheus,high_fast,"/version"
10 | ui,prometheus,high_fast,"/config"
11 |
12 | # Endpoints which should be highly available but latency can be higher depending on amount of rendered data.
13 | ui,prometheus,high_slow,"/federate"
14 | ui,prometheus,high_slow,"/targets"
15 | ui,prometheus,high_slow,"/service-discovery"
16 | ui,prometheus,high_slow,"/rules"
17 | ui,prometheus,high_slow,"/alerts"
18 | ui,prometheus,high_slow,"/consoles/*filepath"
19 |
--------------------------------------------------------------------------------
/examples/prometheus/regexp_events_classification.csv:
--------------------------------------------------------------------------------
1 | # Endpoints which should be highly available but latency can be higher depending on amount of requested data.
2 | api,prometheus,high_slow,"/api/.*"
3 |
--------------------------------------------------------------------------------
/examples/prometheus/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 | webServerListenAddress: "0.0.0.0:8080"
2 |
3 | pipeline:
4 | [
5 | "prometheusIngester",
6 | "relabel",
7 | "eventKeyGenerator",
8 | "dynamicClassifier",
9 | "sloEventProducer",
10 | "prometheusExporter",
11 | ]
12 |
13 | modules:
14 | prometheusIngester:
15 | apiUrl: "http://demo.robustperception.io:9090"
16 | httpHeaders:
17 | - name: X-Scope-OrgID
18 | value: "myOrganization"
19 | # - name: Authorization
20 | # valueFromEnv:
21 | # name: "SLO_EXPORTER_AUTH_TOKEN"
22 | # valuePrefix: "Bearer "
23 | queryTimeout: 30s
24 | queries:
25 | # Generate events from counter for every HTTP request with status code for availability SLO.
26 | - type: counter_increase
27 | query: "prometheus_http_requests_total"
28 | interval: 30s
29 | offset: 5m
30 | additionalLabels:
31 | event_type: http_request_result
32 |
33 | # Generate events from histogram for every HTTP request for latency SLO.
34 | - type: histogram_increase
35 | query: "prometheus_http_request_duration_seconds_bucket"
36 | interval: 30s
37 | additionalLabels:
38 | event_type: http_request_latency
39 |
40 | relabel:
41 | eventRelabelConfigs:
42 | # Drop all events on the `/debug` and `/new` handler since we do not want to define SLO on those.
43 | - source_labels: ["handler"]
44 | regex: "/(debug|new).*"
45 | action: drop
46 |
47 | eventKeyGenerator:
48 | metadataKeys:
49 | - handler
50 |
51 | dynamicClassifier:
52 | exactMatchesCsvFiles:
53 | - "exact_events_classification.csv"
54 | regexpMatchesCsvFiles:
55 | - "regexp_events_classification.csv"
56 |
57 | sloEventProducer:
58 | rulesFiles:
59 | - "slo_rules.yaml"
60 |
61 | prometheusExporter:
62 | maximumUniqueEventKeys: 100
63 |
--------------------------------------------------------------------------------
/examples/prometheus/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 |
3 | # HTTP requests availability rules
4 | - metadata_matcher:
5 | - key: event_type
6 | operator: isEqualTo
7 | value: http_request_result
8 | failure_conditions:
9 | # Requests to api are failed events if resulted with status code higher or equal to 500
10 | - key: code
11 | operator: numberIsEqualOrHigherThan
12 | value: 500
13 | additional_metadata:
14 | slo_type: api_availability
15 | slo_version: 1
16 |
17 | # HTTP requests latency rules
18 | - metadata_matcher:
19 | - key: event_type
20 | operator: isEqualTo
21 | value: http_request_latency
22 | slo_matcher:
23 | class: critical
24 | failure_conditions:
25 | - operator: numberIsHigherThan
26 | key: prometheusHistogramMinValue
27 | value: 0.1
28 | additional_metadata:
29 | slo_type: api_latency
30 | slo_version: 1
31 |
32 | - metadata_matcher:
33 | - key: event_type
34 | operator: isEqualTo
35 | value: http_request_latency
36 | slo_matcher:
37 | class: high_fast
38 | failure_conditions:
39 | - operator: numberIsHigherThan
40 | key: prometheusHistogramMinValue
41 | value: 0.2
42 | additional_metadata:
43 | slo_type: api_latency
44 | slo_version: 1
45 |
46 | - metadata_matcher:
47 | - key: event_type
48 | operator: isEqualTo
49 | value: http_request_latency
50 | slo_matcher:
51 | class: high_slow
52 | failure_conditions:
53 | - operator: numberIsHigherThan
54 | key: prometheusHistogramMinValue
55 | value: 3
56 | additional_metadata:
57 | slo_type: api_latency
58 | slo_version: 1
59 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/seznam/slo-exporter
2 |
3 | go 1.23
4 |
5 | require (
6 | github.com/envoyproxy/go-control-plane v0.13.1
7 | github.com/go-kit/kit v0.13.0
8 | github.com/go-test/deep v1.0.6
9 | github.com/golang/protobuf v1.5.4
10 | github.com/gorilla/mux v1.8.1
11 | github.com/grafana/loki v1.6.2-0.20211108122114-f61a4d2612d8
12 | github.com/grpc-ecosystem/go-grpc-prometheus v1.2.1-0.20191002090509-6af20e3a5340
13 | github.com/hashicorp/go-multierror v1.1.1
14 | github.com/hpcloud/tail v1.0.1-0.20180514194441-a1dbeea552b7
15 | github.com/iancoleman/strcase v0.3.0
16 | github.com/prometheus/client_golang v1.20.5
17 | github.com/prometheus/client_model v0.6.1
18 | github.com/prometheus/common v0.60.1
19 | github.com/prometheus/prometheus v1.8.2-0.20211011171444-354d8d2ecfac
20 | github.com/segmentio/kafka-go v0.4.47
21 | github.com/sirupsen/logrus v1.9.3
22 | github.com/spf13/viper v1.19.0
23 | github.com/stretchr/testify v1.9.0
24 | go.uber.org/atomic v1.11.0
25 | golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c
26 | gonum.org/v1/gonum v0.15.1
27 | google.golang.org/grpc v1.68.0
28 | google.golang.org/protobuf v1.35.1
29 | gopkg.in/alecthomas/kingpin.v2 v2.2.6
30 | gopkg.in/yaml.v2 v2.4.0
31 | )
32 |
33 | require (
34 | github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
35 | github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b // indirect
36 | github.com/beorn7/perks v1.0.1 // indirect
37 | github.com/cespare/xxhash/v2 v2.3.0 // indirect
38 | github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect
39 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
40 | github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect
41 | github.com/fsnotify/fsnotify v1.8.0 // indirect
42 | github.com/go-kit/log v0.2.1 // indirect
43 | github.com/go-logfmt/logfmt v0.6.0 // indirect
44 | github.com/gogo/googleapis v1.4.1 // indirect
45 | github.com/hashicorp/errwrap v1.1.0 // indirect
46 | github.com/hashicorp/hcl v1.0.0 // indirect
47 | github.com/json-iterator/go v1.1.12 // indirect
48 | github.com/klauspost/compress v1.17.11 // indirect
49 | github.com/kylelemons/godebug v1.1.0 // indirect
50 | github.com/magiconair/properties v1.8.7 // indirect
51 | github.com/mitchellh/mapstructure v1.5.0 // indirect
52 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
53 | github.com/modern-go/reflect2 v1.0.2 // indirect
54 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
55 | github.com/pelletier/go-toml/v2 v2.2.3 // indirect
56 | github.com/pierrec/lz4/v4 v4.1.21 // indirect
57 | github.com/pkg/errors v0.9.1 // indirect
58 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
59 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
60 | github.com/prometheus/procfs v0.15.1 // indirect
61 | github.com/sagikazarmark/locafero v0.6.0 // indirect
62 | github.com/sagikazarmark/slog-shim v0.1.0 // indirect
63 | github.com/sourcegraph/conc v0.3.0 // indirect
64 | github.com/spf13/afero v1.11.0 // indirect
65 | github.com/spf13/cast v1.7.0 // indirect
66 | github.com/spf13/pflag v1.0.5 // indirect
67 | github.com/subosito/gotenv v1.6.0 // indirect
68 | go.uber.org/multierr v1.11.0 // indirect
69 | golang.org/x/net v0.30.0 // indirect
70 | golang.org/x/sys v0.27.0 // indirect
71 | golang.org/x/text v0.20.0 // indirect
72 | google.golang.org/genproto/googleapis/rpc v0.0.0-20241104194629-dd2ea8efbc28 // indirect
73 | gopkg.in/fsnotify/fsnotify.v1 v1.4.7 // indirect
74 | gopkg.in/ini.v1 v1.67.0 // indirect
75 | gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
76 | gopkg.in/yaml.v3 v3.0.1 // indirect
77 | )
78 |
--------------------------------------------------------------------------------
/grafana_dashboards/README.md:
--------------------------------------------------------------------------------
1 | # Grafana dashboards
2 | ### [slo-exporter dashboard](./slo_exporter.json)
3 | Dashboard visualising application metrics of slo-exporter itself.
4 |
--------------------------------------------------------------------------------
/grafana_dashboards/SLO_detailed.json:
--------------------------------------------------------------------------------
1 | ../examples/all_in_one/grafana/provisioning/dashboards/SLO_detailed.json
--------------------------------------------------------------------------------
/grafana_dashboards/SLO_domains_overview.json:
--------------------------------------------------------------------------------
1 | ../examples/all_in_one/grafana/provisioning/dashboards/SLO_domains_overview.json
--------------------------------------------------------------------------------
/grafana_dashboards/SLO_drilldown.json:
--------------------------------------------------------------------------------
1 | ../examples/all_in_one/grafana/provisioning/dashboards/SLO_drilldown.json
--------------------------------------------------------------------------------
/grafana_dashboards/slo_exporter.json:
--------------------------------------------------------------------------------
1 | ../examples/all_in_one/grafana/provisioning/dashboards/slo_exporter.json
--------------------------------------------------------------------------------
/kubernetes/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes manifests
2 |
3 | These serve as an example. You will probably want to use kustomize or some other configuration/deployment tool.
4 |
5 | ## Workload type
6 |
We recommend using a StatefulSet instead of a Deployment to mitigate high churn in long-term metrics - the StatefulSet's stable pod name format (which is propagated to Prometheus' instance label) makes it easier for Prometheus to calculate the SLO over long time periods.
8 |
9 | ## Configuration
10 |
We recommend either building your own docker image based on the upstream one with the configuration baked in, or including the configuration as versioned configmap(s), in order to simplify rollbacks.
12 |
13 | In this example we use it without the versioning just for simplification.
14 |
15 |
--------------------------------------------------------------------------------
/kubernetes/slo-exporter-configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: slo-exporter-config
5 | labels:
6 | app: slo-exporter
7 | data:
8 | slo_exporter.yaml: |
9 | webServerListenAddress: "0.0.0.0:8001"
10 | maximumGracefulShutdownDuration: "10s"
11 | afterPipelineShutdownDelay: "1s"
12 |
13 | pipeline: ["envoyAccessLogServer", "relabel", "eventKeyGenerator", "metadataClassifier", "sloEventProducer", "prometheusExporter"]
14 |
15 | modules:
16 |
17 | envoyAccessLogServer: {}
18 |
19 | relabel:
20 | eventRelabelConfigs:
21 | # Drop events with unwanted status codes
22 | - source_labels: ["responseCode"]
23 | regex: "30[12]|40[045]|411|408|499"
24 | action: drop
25 |
26 | eventKeyGenerator:
27 | filedSeparator: ":"
28 | metadataKeys:
29 | - requestMethod
30 | - path
31 |
32 | metadataClassifier:
33 | sloDomainMetadataKey: http_slo-domain
34 | sloClassMetadataKey: http_slo-class
35 | sloAppMetadataKey: http_slo-app
36 |
37 | sloEventProducer:
38 | rulesFiles:
39 | - "slo_rules.yaml"
40 |
41 | prometheusExporter:
42 | metricName: "slo_events_total"
43 | labelNames:
44 | result: "result"
45 | sloDomain: "slo_domain"
46 | sloClass: "slo_class"
47 | sloApp: "slo_app"
48 | eventKey: "event_key"
49 |
50 | slo_rules.yaml: |
51 | rules:
52 | - slo_matcher:
53 | domain: example-domain
54 | failure_conditions:
55 | - operator: numberIsEqualOrHigherThan
56 | key: responseCode
57 | value: 500
58 | additional_metadata:
59 | slo_type: availability
60 | slo_version: 1
61 | namespace: test
62 |
63 | - slo_matcher:
64 | domain: example-domain
65 | class: critical
66 | failure_conditions:
67 | - operator: durationIsHigherThan
68 | key: timeToLastDownstreamTxByte
69 | value: 10ms
70 | additional_metadata:
71 | slo_version: 1
72 | slo_type: latency90
73 | percentile: 90
74 | le: 0.01
75 | namespace: test
76 |
77 | - slo_matcher:
78 | domain: example-domain
79 | class: critical
80 | failure_conditions:
81 | - operator: durationIsHigherThan
82 | key: timeToLastDownstreamTxByte
83 | value: 50ms
84 | additional_metadata:
85 | slo_version: 1
86 | slo_type: latency99
87 | percentile: 99
88 | le: 0.05
89 | namespace: test
90 |
--------------------------------------------------------------------------------
/kubernetes/slo-exporter-service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: slo-exporter
5 | spec:
6 | type: ClusterIP
7 | ports:
8 | - name: grpc-logging
9 | port: 18090
10 | protocol: TCP
11 | targetPort: 18090
12 | selector:
13 | app: slo-exporter
14 |
--------------------------------------------------------------------------------
/kubernetes/slo-exporter-statefulset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: StatefulSet
3 | metadata:
4 | name: slo-exporter
5 | spec:
6 | replicas: 2
7 | serviceName: slo-exporter
8 | podManagementPolicy: Parallel
9 | selector:
10 | matchLabels:
11 | app: slo-exporter
12 | template:
13 | metadata:
14 | labels:
15 | app: slo-exporter
16 | name: slo-exporter
17 | annotations:
18 | prometheus.metrics.scrape: "true"
19 | prometheus.metrics.port: "8080"
20 | prometheus.metrics.path: "/metrics"
21 | spec:
22 | containers:
23 | - name: slo-exporter
24 | image: seznam/slo-exporter:latest
25 | workingDir: "/slo-exporter-config"
26 | args:
27 | - --config-file=slo_exporter.yaml
28 | resources:
29 | requests:
30 | cpu: "0.1"
31 | memory: "100Mi"
32 | limits:
33 | cpu: "0.5"
34 | memory: "250Mi"
35 | ports:
36 | - containerPort: 8080
37 | protocol: TCP
38 | - containerPort: 18090
39 | protocol: TCP
40 | volumeMounts:
41 | - name: slo-exporter-config
42 | mountPath: "/slo-exporter-config"
43 | volumes:
44 | - name: slo-exporter-config
45 | configMap:
46 | name: slo-exporter-config
47 |
--------------------------------------------------------------------------------
/pkg/config/config.go:
--------------------------------------------------------------------------------
1 | package config
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "time"
7 |
8 | "github.com/sirupsen/logrus"
9 | "github.com/spf13/viper"
10 | )
11 |
12 | func New(logger logrus.FieldLogger) *Config {
13 | return &Config{logger: logger}
14 | }
15 |
// Config is the top-level slo-exporter configuration, populated from the
// YAML configuration file by LoadFromFile.
type Config struct {
	// Pipeline lists module names in processing order.
	Pipeline []string
	// LogLevel is the log level name (defaults to "info", see LoadFromFile).
	LogLevel string
	// WebServerListenAddress is the host:port the web server binds to
	// (defaults to "0.0.0.0:8080").
	WebServerListenAddress string
	// MaximumGracefulShutdownDuration bounds graceful shutdown (defaults to 20s).
	MaximumGracefulShutdownDuration time.Duration
	// AfterPipelineShutdownDelay is an additional delay applied after the
	// pipeline shuts down (defaults to 0s).
	AfterPipelineShutdownDelay time.Duration
	// Modules maps a module name to its raw, not-yet-decoded configuration;
	// use ModuleConfig to obtain a decodable sub-config.
	Modules map[string]interface{}
	logger  logrus.FieldLogger
}
25 |
// setupViper configures the global viper instance: YAML as the config format
// and automatic environment variable overrides prefixed with SLO_EXPORTER_.
func (c *Config) setupViper() {
	viper.SetConfigType("yaml")
	viper.SetEnvPrefix("slo_exporter")
	viper.AutomaticEnv()
}
31 |
32 | func (c *Config) LoadFromFile(path string) error {
33 | c.setupViper()
34 | viper.SetDefault("LogLevel", "info")
35 | viper.SetDefault("WebServerListenAddress", "0.0.0.0:8080")
36 | viper.SetDefault("MaximumGracefulShutdownDuration", 20*time.Second)
37 | viper.SetDefault("AfterPipelineShutdownDelay", 0*time.Second)
38 | yamlFile, err := os.Open(path)
39 | if err != nil {
40 | return fmt.Errorf("failed to open configuration file: %w", err)
41 | }
42 | if err := viper.ReadConfig(yamlFile); err != nil {
43 | return fmt.Errorf("failed to load configuration file: %w", err)
44 | }
45 | if err := viper.UnmarshalExact(c); err != nil {
46 | return fmt.Errorf("failed to unmarshall configuration file: %w", err)
47 | }
48 | return nil
49 | }
50 |
51 | func (c *Config) ModuleConfig(moduleName string) (*viper.Viper, error) {
52 | subConfig := viper.Sub("modules." + moduleName)
53 | if subConfig == nil {
54 | return nil, fmt.Errorf("missing configuration for module %s", moduleName)
55 | }
56 | subConfig.SetEnvPrefix("slo_exporter_" + moduleName)
57 | subConfig.AutomaticEnv()
58 | return subConfig, nil
59 | }
60 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/matcher.go:
--------------------------------------------------------------------------------
1 | package dynamic_classifier
2 |
3 | import (
4 | "io"
5 |
6 | "github.com/seznam/slo-exporter/pkg/event"
7 | )
8 |
// matcherType identifies a matcher implementation (e.g. "exact", "regexp").
type matcherType string

// matcher is the common interface of endpoint classification stores used by
// the dynamic classifier.
type matcher interface {
	// getType reports which matcher implementation this is.
	getType() matcherType
	// set stores a classification under the given key (or pattern).
	set(key string, classification *event.SloClassification) error
	// get looks up the classification for the given key.
	get(key string) (*event.SloClassification, error)
	// dumpCSV writes the matcher's contents to w in CSV form.
	dumpCSV(w io.Writer) error
}
17 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/matcher_test.go:
--------------------------------------------------------------------------------
1 | package dynamic_classifier
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "os"
7 | "path/filepath"
8 | "reflect"
9 | "regexp"
10 | "testing"
11 |
12 | "github.com/seznam/slo-exporter/pkg/event"
13 | "github.com/sirupsen/logrus"
14 | "github.com/stretchr/testify/assert"
15 | )
16 |
17 | func newTestSloClassification() *event.SloClassification {
18 | return &event.SloClassification{
19 | Domain: "test-domain",
20 | App: "test-app",
21 | Class: "test-class",
22 | }
23 | }
24 |
25 | func TestMatcher(t *testing.T) {
26 | logger := logrus.New()
27 | cases := []struct {
28 | matcher matcher
29 | key string
30 | value *event.SloClassification
31 | wantedKey string
32 | wantedValue *event.SloClassification
33 | setErr string
34 | getErr string
35 | }{
36 | {newMemoryExactMatcher(logger), "test", newTestSloClassification(), "test", newTestSloClassification(), "", ""},
37 | {newMemoryExactMatcher(logger), "", newTestSloClassification(), "", newTestSloClassification(), "", ""},
38 | {newMemoryExactMatcher(logger), "test", newTestSloClassification(), "aaa", nil, "", ""},
39 | {newRegexpMatcher(logger), ".*", newTestSloClassification(), "aaa", newTestSloClassification(), "", ""},
40 | {newRegexpMatcher(logger), ".*****", newTestSloClassification(), "aaa", newTestSloClassification(), "failed to create new regexp endpoint classification: error parsing regexp: invalid nested repetition operator: `**`", ""},
41 | }
42 |
43 | for _, v := range cases {
44 | err := v.matcher.set(v.key, v.value)
45 | if err != nil && v.setErr != "" {
46 | assert.EqualError(t, err, v.setErr)
47 | return
48 | }
49 | value, err := v.matcher.get(v.wantedKey)
50 | if err != nil && v.setErr != "" {
51 | assert.EqualError(t, err, v.getErr)
52 | return
53 | }
54 |
55 | if !reflect.DeepEqual(value, v.wantedValue) {
56 | t.Errorf("Get returned non-expected value %+v != %+v", value, v.wantedValue)
57 | }
58 |
59 | }
60 | }
61 |
62 | func testDumpCSV(t *testing.T, matcher matcher) {
63 | expectedDataFilename := filepath.Join("testdata", t.Name()+".golden")
64 | expectedDataBytes, err := os.ReadFile(expectedDataFilename)
65 | if err != nil {
66 | t.Fatal(err)
67 | }
68 |
69 | var dataBytes []byte
70 | dataBuffer := bytes.NewBuffer(dataBytes)
71 | err = matcher.dumpCSV(dataBuffer)
72 | assert.NoError(t, err)
73 | assert.EqualValues(t, expectedDataBytes, dataBuffer.Bytes(),
74 | fmt.Sprintf("expected:\n%s\nactual:\n%s", string(expectedDataBytes), dataBuffer.String()))
75 | }
76 |
77 | func TestMatcherExactDumpCSV(t *testing.T) {
78 | matcher := newMemoryExactMatcher(logrus.New())
79 | matcher.exactMatches["test-endpoint"] = newTestSloClassification()
80 | testDumpCSV(t, matcher)
81 | }
82 |
83 | func TestMatcherRegexpDumpCSV(t *testing.T) {
84 | matcher := newRegexpMatcher(logrus.New())
85 | matcher.matchers = append(matcher.matchers,
86 | ®expSloClassification{
87 | regexpCompiled: regexp.MustCompile(".*"),
88 | classification: newTestSloClassification(),
89 | },
90 | )
91 | testDumpCSV(t, matcher)
92 | }
93 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/memory_exact_matcher.go:
--------------------------------------------------------------------------------
1 | package dynamic_classifier
2 |
3 | import (
4 | "encoding/csv"
5 | "fmt"
6 | "io"
7 | "sync"
8 |
9 | "github.com/prometheus/client_golang/prometheus"
10 | "github.com/seznam/slo-exporter/pkg/event"
11 | "github.com/sirupsen/logrus"
12 | )
13 |
// exactMatcherType is the matcherType reported by memoryExactMatcher.
const exactMatcherType = "exact"

// memoryExactMatcher is an in-memory, mutex-guarded map from exact event key
// to its SLO classification.
type memoryExactMatcher struct {
	// exactMatches maps an event key to its classification.
	exactMatches map[string]*event.SloClassification
	// mtx guards concurrent access to exactMatches.
	mtx    sync.RWMutex
	logger logrus.FieldLogger
}
21 |
22 | // newMemoryExactMatcher returns instance of memoryCache.
23 | func newMemoryExactMatcher(logger logrus.FieldLogger) *memoryExactMatcher {
24 | exactMatches := map[string]*event.SloClassification{}
25 | return &memoryExactMatcher{
26 | exactMatches: exactMatches,
27 | mtx: sync.RWMutex{},
28 | logger: logger,
29 | }
30 | }
31 |
32 | // set sets endpoint classification in cache.
33 | func (c *memoryExactMatcher) set(key string, classification *event.SloClassification) error {
34 | timer := prometheus.NewTimer(matcherOperationDurationSeconds.WithLabelValues("set", exactMatcherType))
35 | defer timer.ObserveDuration()
36 | c.mtx.Lock()
37 | defer c.mtx.Unlock()
38 |
39 | c.exactMatches[key] = classification
40 | return nil
41 | }
42 |
// get returns the classification stored for key. A missing key is not an
// error: the result is (nil, nil), so callers must check for a nil value.
func (c *memoryExactMatcher) get(key string) (*event.SloClassification, error) {
	timer := prometheus.NewTimer(matcherOperationDurationSeconds.WithLabelValues("get", exactMatcherType))
	defer timer.ObserveDuration()
	c.mtx.RLock()
	defer c.mtx.RUnlock()

	value := c.exactMatches[key]
	return value, nil
}
53 |
// getType identifies this matcher implementation as the "exact" type.
func (c *memoryExactMatcher) getType() matcherType {
	return exactMatcherType
}
57 |
58 | func (c *memoryExactMatcher) dumpCSV(w io.Writer) error {
59 | c.mtx.RLock()
60 | defer c.mtx.RUnlock()
61 |
62 | buffer := csv.NewWriter(w)
63 | defer buffer.Flush()
64 | for k, v := range c.exactMatches {
65 | err := buffer.Write([]string{v.Domain, v.App, v.Class, k})
66 | if err != nil {
67 | errorsTotal.WithLabelValues("dumpExactMatchersToCSV").Inc()
68 | return fmt.Errorf("failed to dump csv: %w", err)
69 | }
70 | buffer.Flush()
71 | }
72 | return nil
73 | }
74 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/regexp_matcher.go:
--------------------------------------------------------------------------------
1 | package dynamic_classifier
2 |
3 | import (
4 | "encoding/csv"
5 | "fmt"
6 | "io"
7 | "regexp"
8 | "sync"
9 |
10 | "github.com/prometheus/client_golang/prometheus"
11 | "github.com/seznam/slo-exporter/pkg/event"
12 | "github.com/sirupsen/logrus"
13 | )
14 |
// regexpMatcherType is the matcherType reported by regexpMatcher.
const regexpMatcherType = "regexp"

// regexpSloClassification encapsulates combination of regexp and endpoint classification.
type regexpSloClassification struct {
	// regexpCompiled is the pattern an event key must match.
	regexpCompiled *regexp.Regexp
	// classification is returned for keys matching regexpCompiled.
	classification *event.SloClassification
}

// regexpMatcher is an ordered, mutex-guarded list of regexp classification
// rules; lookup returns the first matching rule (see get).
type regexpMatcher struct {
	matchers []*regexpSloClassification
	// mtx guards concurrent access to matchers.
	mtx    sync.RWMutex
	logger logrus.FieldLogger
}
29 |
30 | // newRegexpMatcher returns new instance of regexpMatcher.
31 | func newRegexpMatcher(logger logrus.FieldLogger) *regexpMatcher {
32 | return ®expMatcher{
33 | mtx: sync.RWMutex{},
34 | logger: logger,
35 | }
36 | }
37 |
38 | // newRegexSloClassification returns new instance of regexpSloClassification.
39 | func newRegexSloClassification(regexpString string, classification *event.SloClassification) (*regexpSloClassification, error) {
40 | compiledMatcher, err := regexp.Compile(regexpString)
41 | if err != nil {
42 | return nil, fmt.Errorf("failed to create new regexp endpoint classification: %w", err)
43 | }
44 | rec := ®expSloClassification{
45 | regexpCompiled: compiledMatcher,
46 | classification: classification,
47 | }
48 | return rec, nil
49 | }
50 |
51 | // set adds new endpoint classification regexp to list.
52 | func (rm *regexpMatcher) set(regexpString string, classification *event.SloClassification) error {
53 | timer := prometheus.NewTimer(matcherOperationDurationSeconds.WithLabelValues("set", regexpMatcherType))
54 | defer timer.ObserveDuration()
55 | rm.mtx.Lock()
56 | defer rm.mtx.Unlock()
57 |
58 | regexpClassification, err := newRegexSloClassification(regexpString, classification)
59 | if err != nil {
60 | return err
61 | }
62 | rm.matchers = append(rm.matchers, regexpClassification)
63 | return nil
64 | }
65 |
// get scans all rules in insertion order and returns the classification of
// the FIRST regexp matching key; any later rule that also matches only emits
// a warning. Returns (nil, nil) when no rule matches.
func (rm *regexpMatcher) get(key string) (*event.SloClassification, error) {
	timer := prometheus.NewTimer(matcherOperationDurationSeconds.WithLabelValues("get", regexpMatcherType))
	defer timer.ObserveDuration()
	rm.mtx.RLock()
	defer rm.mtx.RUnlock()

	var classification *event.SloClassification
	for _, r := range rm.matchers {
		// go next if no match
		if !r.regexpCompiled.MatchString(key) {
			continue
		}

		// Already classified by an earlier rule: first match wins, just warn.
		if classification != nil {
			rm.logger.Warnf("key '%s' is matched by another regexp: '%s'\n", key, r.regexpCompiled.String())
			continue
		}
		classification = r.classification
	}
	return classification, nil
}
89 |
// getType identifies this matcher implementation as the "regexp" type.
func (rm *regexpMatcher) getType() matcherType {
	return regexpMatcherType
}
93 |
94 | func (rm *regexpMatcher) dumpCSV(w io.Writer) error {
95 | rm.mtx.RLock()
96 | defer rm.mtx.RUnlock()
97 |
98 | buffer := csv.NewWriter(w)
99 | defer buffer.Flush()
100 | for _, v := range rm.matchers {
101 | err := buffer.Write([]string{v.classification.Domain, v.classification.App, v.classification.Class, v.regexpCompiled.String()})
102 | if err != nil {
103 | errorsTotal.WithLabelValues("dumpRegexpMatchersToCSV").Inc()
104 | return fmt.Errorf("failed to dump csv: %w", err)
105 | }
106 | buffer.Flush()
107 | }
108 | return nil
109 | }
110 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/testdata/TestClassificationByExactMatches.golden:
--------------------------------------------------------------------------------
1 | # Test comment
2 | test-domain,test-app,test-class,"GET:/testing-endpoint"
3 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/testdata/TestClassificationByRegexpMatches.golden:
--------------------------------------------------------------------------------
1 | # Test comment
2 | test-domain,test-app,test-class,"/api/test/.*"
3 | test-domain,test-app,test-class-all,"/api/.*"
4 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/testdata/TestLoadExactMatchesFromMultipleCSV.golden:
--------------------------------------------------------------------------------
1 | test-domain,test-app,test-class,"GET:/testing-endpoint"
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/testdata/TestLoadRegexpMatchesFromMultipleCSV.golden:
--------------------------------------------------------------------------------
1 | test-domain,test-app,test-class,".*"
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/testdata/TestMatcherExactDumpCSV.golden:
--------------------------------------------------------------------------------
1 | test-domain,test-app,test-class,test-endpoint
2 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/testdata/TestMatcherRegexpDumpCSV.golden:
--------------------------------------------------------------------------------
1 | test-domain,test-app,test-class,.*
2 |
--------------------------------------------------------------------------------
/pkg/dynamic_classifier/testdata/Test_DynamicClassifier_Classify_OverridesCacheFromConfig.golden:
--------------------------------------------------------------------------------
1 | domain,app,class,GET:/testing-endpoint
2 |
--------------------------------------------------------------------------------
/pkg/envoy_access_log_server/util.go:
--------------------------------------------------------------------------------
1 | package envoy_access_log_server
2 |
3 | import (
4 | "fmt"
5 |
6 | pbduration "github.com/golang/protobuf/ptypes/duration"
7 | )
8 |
9 | // Returns deterministic string representation of the given duration - ns.
10 | func pbDurationDeterministicString(d *pbduration.Duration) (string, error) {
11 | if d == nil {
12 | return "", fmt.Errorf(" duration given")
13 | }
14 | if !d.IsValid() {
15 | return "", fmt.Errorf("invalid duration given: %s", d)
16 | }
17 | return fmt.Sprint(d.AsDuration().Nanoseconds()) + "ns", nil
18 | }
19 |
--------------------------------------------------------------------------------
/pkg/event/raw.go:
--------------------------------------------------------------------------------
1 | package event
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/seznam/slo-exporter/pkg/stringmap"
7 | )
8 |
// Raw represents single event as received by an EventsProcessor instance.
type Raw struct {
	// Metadata holds arbitrary key/value pairs describing the event,
	// including the reserved event-key entry (see eventKeyMetadataKey).
	Metadata stringmap.StringMap
	// SloClassification is nil until the event has been classified.
	SloClassification *SloClassification
	// Quantity is the numeric quantity associated with the event.
	Quantity float64
}

const (
	// eventKeyMetadataKey is the reserved metadata key under which the
	// event key is stored; the "__" prefix avoids collisions with
	// user-supplied metadata keys.
	eventKeyMetadataKey = "__eventKey"
)
19 |
20 | func (r *Raw) EventKey() string {
21 | return r.Metadata[eventKeyMetadataKey]
22 | }
23 |
24 | func (r *Raw) SetEventKey(k string) {
25 | if r.Metadata == nil {
26 | r.Metadata = make(stringmap.StringMap)
27 | }
28 | r.Metadata[eventKeyMetadataKey] = k
29 | }
30 |
// UpdateSLOClassification replaces the event's SloClassification with the
// given one (the pointer is stored as-is, not copied).
func (r *Raw) UpdateSLOClassification(classification *SloClassification) {
	r.SloClassification = classification
}
35 |
36 | // IsClassified check if all SloClassification fields are set.
37 | func (r *Raw) IsClassified() bool {
38 | if r.SloClassification != nil &&
39 | r.SloClassification.Domain != "" &&
40 | r.SloClassification.App != "" &&
41 | r.SloClassification.Class != "" {
42 |
43 | return true
44 | }
45 | return false
46 | }
47 |
48 | func (r Raw) GetSloMetadata() stringmap.StringMap {
49 | if r.SloClassification == nil {
50 | return nil
51 | }
52 | metadata := r.SloClassification.GetMetadata()
53 | return metadata
54 | }
55 |
// GetSloClassification returns the event's classification; nil when the
// event has not been classified.
func (r Raw) GetSloClassification() *SloClassification {
	return r.SloClassification
}
59 |
// String renders the event's key, quantity, metadata and SLO metadata in a
// human-readable form for logging.
func (r Raw) String() string {
	return fmt.Sprintf("key: %s, quantity: %f, metadata: %s, classification: %s", r.EventKey(), r.Quantity, r.Metadata, r.GetSloMetadata())
}
63 |
--------------------------------------------------------------------------------
/pkg/event/slo.go:
--------------------------------------------------------------------------------
1 | package event
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/seznam/slo-exporter/pkg/stringmap"
7 | )
8 |
// Result is the outcome of evaluating an event against SLO criteria.
type Result string

// String returns the result as a plain string.
func (r Result) String() string {
	return string(r)
}

const (
	// Success marks an event that met its SLO criteria.
	Success Result = "success"
	// Fail marks an event that violated its SLO criteria.
	Fail Result = "fail"
)

// PossibleResults lists every valid Result value.
var PossibleResults = []Result{Success, Fail}
21 |
// Slo is an SLO event: a classified event together with the result of its
// SLO evaluation.
type Slo struct {
	// same value as in source event Raw.EventKey()
	Key string
	// Result is the SLO evaluation outcome (Success or Fail).
	Result Result

	// Domain, Class and App form the SLO classification of the event.
	Domain string
	Class  string
	App    string

	// Metadata carries additional key/value pairs attached to the event.
	Metadata stringmap.StringMap
	// Quantity is the numeric quantity associated with the event.
	Quantity float64

	// OriginalEvent is the Raw event this SLO event was derived from.
	OriginalEvent Raw
}
36 |
37 | func (s *Slo) IsClassified() bool {
38 | return s.Domain != "" && s.Class != "" && s.App != ""
39 | }
40 |
// String renders the SLO event's key, classification and metadata in a
// human-readable form for logging.
func (s *Slo) String() string {
	return fmt.Sprintf("SLO event %q of domain: %q, class: %q, app: %q with metadata: %+v", s.Key, s.Domain, s.Class, s.App, s.Metadata)
}
44 |
45 | func (s Slo) Copy() Slo {
46 | return Slo{
47 | Key: s.Key,
48 | Result: s.Result,
49 | Domain: s.Domain,
50 | Class: s.Class,
51 | App: s.App,
52 | Metadata: s.Metadata.Copy(),
53 | OriginalEvent: s.OriginalEvent,
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/pkg/event/slo_classification.go:
--------------------------------------------------------------------------------
1 | package event
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/seznam/slo-exporter/pkg/stringmap"
7 | )
8 |
// SloClassification identifies which SLO domain, application and class an
// event belongs to.
type SloClassification struct {
	Domain string
	App    string
	Class  string
}
14 |
15 | func (sc *SloClassification) Matches(other SloClassification) bool {
16 | if sc.Domain != "" && (sc.Domain != other.Domain) {
17 | return false
18 | }
19 | if sc.Class != "" && (sc.Class != other.Class) {
20 | return false
21 | }
22 | if sc.App != "" && (sc.App != other.App) {
23 | return false
24 | }
25 | return true
26 | }
27 |
28 | func (sc *SloClassification) GetMetadata() stringmap.StringMap {
29 | return stringmap.StringMap{
30 | "slo_domain": sc.Domain,
31 | "slo_class": sc.Class,
32 | "app": sc.App,
33 | }
34 | }
35 |
36 | func (sc *SloClassification) Copy() SloClassification {
37 | return SloClassification{
38 | Domain: sc.Domain,
39 | App: sc.App,
40 | Class: sc.Class,
41 | }
42 | }
43 |
// String renders the classification as "domain:app:class".
func (sc *SloClassification) String() string {
	return fmt.Sprintf("%s:%s:%s", sc.Domain, sc.App, sc.Class)
}
47 |
--------------------------------------------------------------------------------
/pkg/event_key_generator/event_key_generator_test.go:
--------------------------------------------------------------------------------
1 | package event_key_generator
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/seznam/slo-exporter/pkg/stringmap"
7 | "github.com/sirupsen/logrus"
8 | "github.com/stretchr/testify/assert"
9 | )
10 |
// TestEventKeyGenerator_generateEventKey verifies event-key generation over a
// table of metadata/config combinations: keys are joined by the configured
// separator, missing metadata keys are skipped, and empty values are kept.
// NOTE: "FiledSeparator" (sic) matches the actual field name of
// eventKeyGeneratorConfig.
func TestEventKeyGenerator_generateEventKey(t *testing.T) {
	testCases := []struct {
		metadata stringmap.StringMap
		config   eventKeyGeneratorConfig
		result   string
	}{
		{metadata: stringmap.StringMap{"foo": "foo"}, config: eventKeyGeneratorConfig{FiledSeparator: ":", MetadataKeys: []string{}}, result: ""},
		{metadata: stringmap.StringMap{"foo": "foo"}, config: eventKeyGeneratorConfig{FiledSeparator: ":", MetadataKeys: []string{"bar"}}, result: ""},
		{metadata: stringmap.StringMap{"foo": "foo"}, config: eventKeyGeneratorConfig{FiledSeparator: ":", MetadataKeys: []string{"foo"}}, result: "foo"},
		{metadata: stringmap.StringMap{"foo": "foo", "bar": "bar"}, config: eventKeyGeneratorConfig{FiledSeparator: ":", MetadataKeys: []string{"foo"}}, result: "foo"},
		{metadata: stringmap.StringMap{"foo": "foo", "bar": "bar"}, config: eventKeyGeneratorConfig{FiledSeparator: ":", MetadataKeys: []string{"foo", "bar"}}, result: "foo:bar"},
		{metadata: stringmap.StringMap{"foo": "foo", "bar": ""}, config: eventKeyGeneratorConfig{FiledSeparator: ":", MetadataKeys: []string{"foo", "bar"}}, result: "foo:"},
		{metadata: stringmap.StringMap{"foo": "foo", "bar": "bar"}, config: eventKeyGeneratorConfig{FiledSeparator: "|", MetadataKeys: []string{"foo", "bar"}}, result: "foo|bar"},
		{metadata: stringmap.StringMap{"foo": "foo", "bar": "bar"}, config: eventKeyGeneratorConfig{FiledSeparator: ":", MetadataKeys: []string{"xxx", "bar"}}, result: "bar"},
	}
	for _, tc := range testCases {
		generator, err := NewFromConfig(tc.config, logrus.New())
		assert.NoError(t, err)
		assert.Equal(t, tc.result, generator.generateEventKey(tc.metadata))
	}
}
32 |
--------------------------------------------------------------------------------
/pkg/event_metadata_renamer/renamer_test.go:
--------------------------------------------------------------------------------
1 | package event_metadata_renamer
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/sirupsen/logrus"
7 | "github.com/stretchr/testify/assert"
8 | "gopkg.in/yaml.v2"
9 |
10 | "github.com/seznam/slo-exporter/pkg/event"
11 | )
12 |
// testCase pairs an input event with the event expected after the metadata
// rename has been applied.
type testCase struct {
	name        string
	inputEvent  *event.Raw
	outputEvent *event.Raw
}
18 |
// testCases covers the renamer behaviors for a "source" -> "destination"
// rename: empty metadata, absent source key, destination-key collision
// (existing value wins), and a plain successful rename.
var testCases = []testCase{
	{
		name:        "event with empty metadata",
		inputEvent:  &event.Raw{Metadata: map[string]string{}},
		outputEvent: &event.Raw{Metadata: map[string]string{}},
	},
	{
		name:        "attempt to rename key which is not present in the event's metadata",
		inputEvent:  &event.Raw{Metadata: map[string]string{"sourceX": "bar"}},
		outputEvent: &event.Raw{Metadata: map[string]string{"sourceX": "bar"}},
	},
	{
		name:        "Destination metadata key already exist (collision)",
		inputEvent:  &event.Raw{Metadata: map[string]string{"destination": "destinationCollisionNotOverriden"}},
		outputEvent: &event.Raw{Metadata: map[string]string{"destination": "destinationCollisionNotOverriden"}},
	},
	{
		name:        "valid rename of metadata key",
		inputEvent:  &event.Raw{Metadata: map[string]string{"source": "bar", "other": "xxx"}},
		outputEvent: &event.Raw{Metadata: map[string]string{"destination": "bar", "other": "xxx"}},
	},
}
41 |
42 | func TestRelabel_Run(t *testing.T) {
43 | configYaml := `
44 | - source: source
45 | destination: destination
46 | `
47 | var config []renamerConfig
48 | err := yaml.UnmarshalStrict([]byte(configYaml), &config)
49 | if err != nil {
50 | t.Fatal(err)
51 | }
52 | mgr, err := NewFromConfig(config, logrus.New())
53 | if err != nil {
54 | t.Fatal(err)
55 | }
56 |
57 | for _, testCase := range testCases {
58 | t.Run(testCase.name, func(t *testing.T) {
59 | assert.Equal(t, testCase.outputEvent, mgr.renameEventMetadata(testCase.inputEvent))
60 | })
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/pkg/metadata_classifier/metadata_cassifier_test.go:
--------------------------------------------------------------------------------
1 | package metadata_classifier
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/seznam/slo-exporter/pkg/event"
7 | "github.com/seznam/slo-exporter/pkg/stringmap"
8 | "github.com/sirupsen/logrus"
9 | "github.com/stretchr/testify/assert"
10 | )
11 |
// TestMetadataClassifier_generateSloClassification verifies how metadata keys
// are turned into an SLO classification, including the OverrideExistingValues
// precedence rules and per-field fallback when a metadata key is absent.
func TestMetadataClassifier_generateSloClassification(t *testing.T) {
	testCases := []struct {
		name   string
		event  event.Raw
		config metadataClassifierConfig
		result event.SloClassification
	}{
		{
			name: "non classified event with expected metadata is classified as expected",
			event: event.Raw{
				Metadata:          stringmap.StringMap{"domain": "domain", "class": "class", "app": "app"},
				SloClassification: &event.SloClassification{Domain: "", Class: "", App: ""},
			},
			config: metadataClassifierConfig{SloDomainMetadataKey: "domain", SloClassMetadataKey: "class", SloAppMetadataKey: "app", OverrideExistingValues: true},
			result: event.SloClassification{Domain: "domain", Class: "class", App: "app"},
		},
		{
			name: "with overwrite enabled, metadata classification has precedence over former event classification",
			event: event.Raw{
				Metadata:          stringmap.StringMap{"domain": "domain", "class": "class", "app": "app"},
				SloClassification: &event.SloClassification{Domain: "xxx", Class: "xxx", App: "xxx"},
			},
			config: metadataClassifierConfig{SloDomainMetadataKey: "domain", SloClassMetadataKey: "class", SloAppMetadataKey: "app", OverrideExistingValues: true},
			result: event.SloClassification{Domain: "domain", Class: "class", App: "app"},
		},
		{
			name: "with overwrite disabled, former event classification has precedence over metadata classification",
			event: event.Raw{
				Metadata:          stringmap.StringMap{"domain": "domain", "class": "class", "app": "app"},
				SloClassification: &event.SloClassification{Domain: "xxx", Class: "xxx", App: "xxx"},
			},
			config: metadataClassifierConfig{SloDomainMetadataKey: "domain", SloClassMetadataKey: "class", SloAppMetadataKey: "app", OverrideExistingValues: false},
			result: event.SloClassification{Domain: "xxx", Class: "xxx", App: "xxx"},
		},
		{
			name: "if specified key is not found in metadata, original value of classification is left intact",
			event: event.Raw{
				Metadata:          stringmap.StringMap{"domain": "domain", "class": "class"},
				SloClassification: &event.SloClassification{Domain: "xxx", Class: "xxx", App: "xxx"},
			},
			config: metadataClassifierConfig{SloDomainMetadataKey: "domain", SloClassMetadataKey: "class", SloAppMetadataKey: "app", OverrideExistingValues: true},
			result: event.SloClassification{Domain: "domain", Class: "class", App: "xxx"},
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			generator, err := NewFromConfig(tc.config, logrus.New())
			assert.NoError(t, err)
			assert.Equal(t, tc.result, generator.generateSloClassification(&tc.event))
		})
	}
}
64 |
--------------------------------------------------------------------------------
/pkg/pipeline/module.go:
--------------------------------------------------------------------------------
1 | package pipeline
2 |
3 | import (
4 | "github.com/gorilla/mux"
5 | "github.com/prometheus/client_golang/prometheus"
6 | "github.com/seznam/slo-exporter/pkg/event"
7 | "github.com/sirupsen/logrus"
8 | "github.com/spf13/viper"
9 | )
10 |
// moduleFactoryFunction builds a pipeline module by its configured name.
type moduleFactoryFunction func(moduleName string, logger logrus.FieldLogger, conf *viper.Viper) (Module, error)

// ModuleConstructor builds a pipeline module from its viper configuration.
type ModuleConstructor func(viperConfig *viper.Viper) (Module, error)

// EventProcessingDurationObserver receives per-event processing durations
// (in seconds, matching the prometheus Observe convention).
type EventProcessingDurationObserver interface {
	Observe(float64)
}

// Module is the minimal life-cycle contract every pipeline module implements.
type Module interface {
	Run()
	Stop()
	Done() bool
}

// ObservableModule is a module that can report per-event processing durations
// to a registered observer.
type ObservableModule interface {
	RegisterEventProcessingDurationObserver(observer EventProcessingDurationObserver)
}

// PrometheusInstrumentedModule is a module exposing its own Prometheus metrics.
type PrometheusInstrumentedModule interface {
	Module
	RegisterMetrics(rootRegistry prometheus.Registerer, wrappedRegistry prometheus.Registerer) error
}

// WebInterfaceModule is a module exposing HTTP endpoints on the shared router.
type WebInterfaceModule interface {
	Module
	RegisterInMux(router *mux.Router)
}

// RawEventIngester consumes raw events from an input channel.
type RawEventIngester interface {
	SetInputChannel(chan *event.Raw)
}

// RawEventIngesterModule is a Module that ingests raw events.
type RawEventIngesterModule interface {
	Module
	RawEventIngester
}

// RawEventProducer emits raw events on its output channel.
type RawEventProducer interface {
	OutputChannel() chan *event.Raw
}

// RawEventProducerModule is a Module that produces raw events.
type RawEventProducerModule interface {
	Module
	RawEventProducer
}

// SloEventIngester consumes SLO events from an input channel.
type SloEventIngester interface {
	SetInputChannel(chan *event.Slo)
}

// SloEventIngesterModule is a Module that ingests SLO events.
type SloEventIngesterModule interface {
	Module
	SloEventIngester
}

// SloEventProducer emits SLO events on its output channel.
type SloEventProducer interface {
	OutputChannel() chan *event.Slo
}

// SloEventProducerModule is a Module that produces SLO events.
type SloEventProducerModule interface {
	Module
	SloEventProducer
}

// ProcessorModule both ingests and produces raw events, i.e. it sits in the
// middle of the raw event pipeline.
type ProcessorModule interface {
	Module
	RawEventIngester
	RawEventProducer
}
80 |
--------------------------------------------------------------------------------
/pkg/pipeline/module_test.go:
--------------------------------------------------------------------------------
1 | package pipeline
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/seznam/slo-exporter/pkg/event"
7 | "github.com/sirupsen/logrus"
8 | "github.com/spf13/viper"
9 | )
10 |
11 | func testModuleFactory(moduleName string, _ logrus.FieldLogger, _ *viper.Viper) (Module, error) {
12 | switch moduleName {
13 | case "testRawIngester":
14 | return testRawIngester{}, nil
15 | case "testRawProducer":
16 | return testRawProducer{}, nil
17 | case "testSloIngester":
18 | return testSloIngester{}, nil
19 | case "testSloProducer":
20 | return testSloProducer{}, nil
21 | default:
22 | return nil, fmt.Errorf("unknown module %s", moduleName)
23 | }
24 | }
25 |
// testRawIngester is a no-op Module stub that also satisfies RawEventIngester.
type testRawIngester struct{}

func (t testRawIngester) Run() {}

func (t testRawIngester) Stop() {}

// Done always reports false; the stub never finishes on its own.
func (t testRawIngester) Done() bool {
	return false
}

func (t testRawIngester) SetInputChannel(chan *event.Raw) {}

// testRawProducer is a no-op Module stub that also satisfies RawEventProducer.
type testRawProducer struct{}

func (t testRawProducer) Run() {}

func (t testRawProducer) Stop() {}

// Done always reports false; the stub never finishes on its own.
func (t testRawProducer) Done() bool {
	return false
}

// OutputChannel returns a fresh unbuffered channel on every call.
func (t testRawProducer) OutputChannel() chan *event.Raw {
	return make(chan *event.Raw)
}

// testSloIngester is a no-op Module stub that also satisfies SloEventIngester.
type testSloIngester struct{}

func (t testSloIngester) Run() {}

func (t testSloIngester) Stop() {}

// Done always reports false; the stub never finishes on its own.
func (t testSloIngester) Done() bool {
	return false
}

func (t testSloIngester) SetInputChannel(chan *event.Slo) {}

// testSloProducer is a no-op Module stub that also satisfies SloEventProducer.
type testSloProducer struct{}

func (t testSloProducer) Run() {}

func (t testSloProducer) Stop() {}

// Done always reports false; the stub never finishes on its own.
func (t testSloProducer) Done() bool {
	return false
}

// OutputChannel returns a fresh unbuffered channel on every call.
func (t testSloProducer) OutputChannel() chan *event.Slo {
	return make(chan *event.Slo)
}
77 |
--------------------------------------------------------------------------------
/pkg/prober/prober.go:
--------------------------------------------------------------------------------
1 | package prober
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "net/http"
7 | "sync"
8 |
9 | "github.com/prometheus/client_golang/prometheus"
10 | "github.com/sirupsen/logrus"
11 | )
12 |
var (
	// ErrDefault is the initial status reason of a prober that has not yet
	// been marked ok (readiness probes start in this state).
	// NOTE(review): errors.New would be the idiomatic constructor since there
	// is no formatting, but switching would leave the "fmt" import unused in
	// this file, so it is kept as is.
	ErrDefault = fmt.Errorf("initializing")

	// status reports the current state of each probe, labeled by probe name.
	// setStatus sets it to 1 when ok and 0 when not ok.
	status = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "probe_status",
			Help: "Status of the probes",
		},
		[]string{"probe"},
	)
)
24 |
25 | // NewLiveness returns prober to be used as a liveness probe.
26 | func NewLiveness(registry prometheus.Registerer, logger logrus.FieldLogger) (*Prober, error) {
27 | p, err := newProber(registry, logger, "liveness")
28 | if err != nil {
29 | return nil, err
30 | }
31 | p.Ok()
32 | return p, nil
33 | }
34 |
35 | // NewReadiness returns prober to be used as a readiness probe.
36 | func NewReadiness(registry prometheus.Registerer, logger logrus.FieldLogger) (*Prober, error) {
37 | p, err := newProber(registry, logger, "readiness")
38 | if err != nil {
39 | return nil, err
40 | }
41 | p.NotOk(ErrDefault)
42 | return p, nil
43 | }
44 |
45 | func newProber(registry prometheus.Registerer, logger logrus.FieldLogger, name string) (*Prober, error) {
46 | p := Prober{
47 | name: name,
48 | statusMtx: sync.Mutex{},
49 | logger: logger,
50 | }
51 | if err := registry.Register(status); err != nil {
52 | if !errors.As(err, &prometheus.AlreadyRegisteredError{}) {
53 | return nil, err
54 | }
55 | }
56 | return &p, nil
57 | }
58 |
// Prober is struct holding information about status.
type Prober struct {
	// name is used as the status metric label and in log messages.
	name string
	// status is nil when the prober is ok, otherwise the reason it is not.
	status error
	// statusMtx guards status.
	statusMtx sync.Mutex
	logger    logrus.FieldLogger
}
66 |
// Ok sets the Prober to the ok status (clears any failure reason).
func (p *Prober) Ok() {
	p.setStatus(nil)
}

// NotOk sets the Prober to not ready status and specifies reason as an error.
func (p *Prober) NotOk(err error) {
	p.setStatus(err)
}

// IsOk returns the reason why the Prober is not ok, or nil when it is ok.
// It is safe for concurrent use.
func (p *Prober) IsOk() error {
	p.statusMtx.Lock()
	defer p.statusMtx.Unlock()
	return p.status
}
83 |
84 | // Allows to use Prober in HTTP life-cycle endpoints.
85 | func (p *Prober) HandleFunc(w http.ResponseWriter, _ *http.Request) {
86 | if p.IsOk() != nil {
87 | http.Error(w, p.IsOk().Error(), http.StatusServiceUnavailable)
88 | return
89 | }
90 | if _, err := w.Write([]byte("OK")); err != nil {
91 | p.logger.Errorf("error writing response: %v", err)
92 | }
93 | }
94 |
95 | func (p *Prober) setStatus(err error) {
96 | p.statusMtx.Lock()
97 | defer p.statusMtx.Unlock()
98 | if p.status != nil && err == nil {
99 | p.logger.Infof("changing %s status to ok", p.name)
100 | status.WithLabelValues(p.name).Set(1)
101 | }
102 | if p.status == nil && err != nil {
103 | p.logger.Warnf("changing %s status to not ok, reason: %+v", p.name, err)
104 | status.WithLabelValues(p.name).Set(0)
105 | }
106 | p.status = err
107 | }
108 |
--------------------------------------------------------------------------------
/pkg/prober/prober_test.go:
--------------------------------------------------------------------------------
1 | package prober
2 |
3 | import (
4 | "net/http"
5 | "net/http/httptest"
6 | "testing"
7 |
8 | "github.com/prometheus/client_golang/prometheus"
9 | "github.com/sirupsen/logrus"
10 | "github.com/stretchr/testify/assert"
11 | )
12 |
13 | func TestProber(t *testing.T) {
14 | p, err := NewLiveness(prometheus.NewRegistry(), logrus.New())
15 | assert.NoError(t, err)
16 | p.Ok()
17 | assert.Equal(t, nil, p.IsOk())
18 | p.NotOk(ErrDefault)
19 | assert.Equal(t, ErrDefault, p.IsOk())
20 | p.Ok()
21 | assert.Equal(t, nil, p.IsOk())
22 | }
23 |
24 | func TestProber_HandleFunc(t *testing.T) {
25 | p, err := NewLiveness(prometheus.NewRegistry(), logrus.New())
26 | assert.NoError(t, err)
27 | req, err := http.NewRequest(http.MethodGet, "/liveness", http.NoBody)
28 | if err != nil {
29 | t.Fatal(err)
30 | }
31 | handler := http.HandlerFunc(p.HandleFunc)
32 |
33 | rr := httptest.NewRecorder()
34 | handler.ServeHTTP(rr, req)
35 | assert.Equal(t, http.StatusOK, rr.Code)
36 |
37 | rr = httptest.NewRecorder()
38 | p.NotOk(ErrDefault)
39 | handler.ServeHTTP(rr, req)
40 | assert.Equal(t, http.StatusServiceUnavailable, rr.Code)
41 |
42 | rr = httptest.NewRecorder()
43 | p.Ok()
44 | handler.ServeHTTP(rr, req)
45 | assert.Equal(t, http.StatusOK, rr.Code)
46 | }
47 |
--------------------------------------------------------------------------------
/pkg/prometheus_exporter/aggregating_counter_test.go:
--------------------------------------------------------------------------------
1 | package prometheus_exporter
2 |
3 | import (
4 | "strings"
5 | "testing"
6 |
7 | "github.com/prometheus/client_golang/prometheus"
8 | "github.com/prometheus/client_golang/prometheus/testutil"
9 | "github.com/seznam/slo-exporter/pkg/stringmap"
10 | "github.com/sirupsen/logrus"
11 | "github.com/stretchr/testify/assert"
12 | )
13 |
14 | func Test_aggregatingCounter(t *testing.T) {
15 | reg := prometheus.NewPedanticRegistry()
16 | aggVec := newAggregatedCounterVectorSet("slo_events_total", metricHelp, []string{
17 | "slo_domain",
18 | "slo_class",
19 | "slo_app",
20 | "event_key",
21 | }, logrus.New(), []string{})
22 | err := aggVec.register(reg)
23 | assert.NoError(t, err)
24 |
25 | expectedMetrics := `
26 | # HELP slo_domain:slo_events_total Total number of SLO events exported with it's result and metadata.
27 | # TYPE slo_domain:slo_events_total counter
28 | slo_domain:slo_events_total{result="success",slo_domain="domain"} 1
29 | # HELP slo_domain_slo_class:slo_events_total Total number of SLO events exported with it's result and metadata.
30 | # TYPE slo_domain_slo_class:slo_events_total counter
31 | slo_domain_slo_class:slo_events_total{result="success",slo_class="critical",slo_domain="domain"} 1
32 | # HELP slo_domain_slo_class_slo_app:slo_events_total Total number of SLO events exported with it's result and metadata.
33 | # TYPE slo_domain_slo_class_slo_app:slo_events_total counter
34 | slo_domain_slo_class_slo_app:slo_events_total{result="success",slo_app="app",slo_class="critical",slo_domain="domain"} 1
35 | # HELP slo_domain_slo_class_slo_app_event_key:slo_events_total Total number of SLO events exported with it's result and metadata.
36 | # TYPE slo_domain_slo_class_slo_app_event_key:slo_events_total counter
37 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="key",result="success",slo_app="app",slo_class="critical",slo_domain="domain"} 1
38 | `
39 |
40 | aggVec.inc(stringmap.StringMap{
41 | "result": "success",
42 | "slo_class": "critical",
43 | "slo_app": "app",
44 | "slo_domain": "domain",
45 | "event_key": "key",
46 | })
47 |
48 | if err := testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics)); err != nil {
49 | t.Errorf("unexpected collecting result:\n%s", err)
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/pkg/prometheus_exporter/exemplars.go:
--------------------------------------------------------------------------------
1 | package prometheus_exporter
2 |
3 | import (
4 | "fmt"
5 | "time"
6 | "unicode/utf8"
7 |
8 | "github.com/prometheus/client_golang/prometheus"
9 | dto "github.com/prometheus/client_model/go"
10 | "google.golang.org/protobuf/proto"
11 | "google.golang.org/protobuf/types/known/timestamppb"
12 | )
13 |
14 | // Support for exemplars is still considered experimental both, in Prometheus and in the client library.
15 | // The client still does not allow to set the exemplars for const metrics intended to be used for exporters.
16 | // As a workaround part of the functionality had to be copied out from the client and new custom constCounterWithExemplar had to be created.
17 | // This is definitely not ideal but if we want to support this functionality, there is no other way for now.
18 |
19 | // FIXME once implemented, use client constMetric exemplars support
20 |
// newExemplar builds a dto.Exemplar from a value, timestamp and label set.
// It validates that every label value is valid UTF-8 and that the total rune
// count of all label names and values stays within prometheus.ExemplarMaxRunes.
// Copied from https://github.com/prometheus/client_golang/blob/0400fc44d42dd0bca7fb16e87ea0313bb2eb8c53/prometheus/value.go#L183 since there is no exposed API for now.
// Kept byte-for-byte close to upstream so the two stay easy to diff.
func newExemplar(value float64, ts time.Time, l prometheus.Labels) (*dto.Exemplar, error) {
	e := &dto.Exemplar{}
	e.Value = proto.Float64(value)
	tsProto := timestamppb.New(ts)
	e.Timestamp = tsProto
	labelPairs := make([]*dto.LabelPair, 0, len(l))
	var runes int
	for name, value := range l {
		// Label-name validation is skipped here: upstream's checkLabelName is
		// also not exported.
		// if !checkLabelName(name) {
		//	return nil, fmt.Errorf("exemplar label name %q is invalid", name)
		//}
		runes += utf8.RuneCountInString(name)
		if !utf8.ValidString(value) {
			return nil, fmt.Errorf("exemplar label value %q is not valid UTF-8", value)
		}
		runes += utf8.RuneCountInString(value)
		labelPairs = append(labelPairs, &dto.LabelPair{
			Name:  proto.String(name),
			Value: proto.String(value),
		})
	}
	if runes > prometheus.ExemplarMaxRunes {
		return nil, fmt.Errorf("exemplar labels have %d runes, exceeding the limit of %d", runes, prometheus.ExemplarMaxRunes)
	}
	e.Label = labelPairs
	return e, nil
}
50 |
// NewConstCounterWithExemplar creates a constant counter metric that can
// carry an exemplar (set later via AddExemplar).
// The ignored ValueType parameter and the always-nil error presumably keep
// the signature parallel to prometheus.NewConstMetric — TODO confirm against
// the call sites.
func NewConstCounterWithExemplar(desc *prometheus.Desc, _ prometheus.ValueType, value float64, labelValues ...string) (ConstCounterWithExemplar, error) {
	return ConstCounterWithExemplar{
		desc:       desc,
		val:        value,
		labelPairs: prometheus.MakeLabelPairs(desc, labelValues),
		exemplar:   nil,
	}, nil
}
59 |
// ExemplarAdder is implemented by metrics that accept an OpenMetrics exemplar.
type ExemplarAdder interface {
	AddExemplar(exemplar *dto.Exemplar)
}

// ConstCounterWithExemplar is a constant counter metric that can additionally
// carry a single exemplar (see the workaround note at the top of this file).
type ConstCounterWithExemplar struct {
	desc       *prometheus.Desc
	val        float64
	labelPairs []*dto.LabelPair
	exemplar   *dto.Exemplar
}

// AddExemplar attaches the given exemplar, replacing any previously set one.
// Note the pointer receiver: the exemplar is only observed when the same
// *ConstCounterWithExemplar instance is later written.
func (c *ConstCounterWithExemplar) AddExemplar(e *dto.Exemplar) {
	c.exemplar = e
}
74 |
// Desc implements prometheus.Metric.
func (c ConstCounterWithExemplar) Desc() *prometheus.Desc {
	return c.desc
}

// Write implements prometheus.Metric by serializing the counter value,
// label pairs and the optional exemplar into the provided protobuf metric.
func (c ConstCounterWithExemplar) Write(metric *dto.Metric) error {
	metric.Label = c.labelPairs
	metric.Counter = &dto.Counter{Value: &c.val, Exemplar: c.exemplar}
	return nil
}
84 |
--------------------------------------------------------------------------------
/pkg/prometheus_ingester/headerRoundTripper.go:
--------------------------------------------------------------------------------
1 | package prometheus_ingester
2 |
3 | import "net/http"
4 |
// httpHeadersRoundTripper wraps another http.RoundTripper and sets the
// configured headers on every outgoing request before delegating to it.
// NOTE(review): the field name "roudTripper" is a typo (roundTripper); it is
// also referenced from the package tests, so renaming it needs a coordinated
// change across files.
type httpHeadersRoundTripper struct {
	headers     map[string]string
	roudTripper http.RoundTripper
}

// RoundTrip implements the http.RoundTripper interface.
// Header.Set is used, so configured headers overwrite any value already
// present on the request under the same (canonicalized) name.
func (h httpHeadersRoundTripper) RoundTrip(r *http.Request) (*http.Response, error) {
	// We use RoundTripper to inject HTTP headers even though it is not advised,
	// but the Prometheus client does not allow us to do it otherwise.
	for k, v := range h.headers {
		r.Header.Set(k, v)
	}

	return h.roudTripper.RoundTrip(r)
}
20 |
--------------------------------------------------------------------------------
/pkg/prometheus_ingester/headerRoundTripper_test.go:
--------------------------------------------------------------------------------
1 | package prometheus_ingester
2 |
3 | import (
4 | "bytes"
5 | "context"
6 | "io"
7 | "net/http"
8 | "net/url"
9 | "testing"
10 |
11 | "github.com/prometheus/client_golang/api"
12 | "github.com/stretchr/testify/assert"
13 | )
14 |
// testHTTPHeaderRoundTripper is a stub http.RoundTripper that asserts the
// incoming request carries exactly the expected headers and returns a canned
// 200 response so the caller's HTTP client code can complete.
type testHTTPHeaderRoundTripper struct {
	expectedHeaders http.Header
	t               *testing.T
}

// RoundTrip asserts the request headers and replies with a fixed JSON body;
// no real network traffic takes place.
func (rt *testHTTPHeaderRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
	assert.Equal(rt.t, rt.expectedHeaders, req.Header)

	return &http.Response{
		StatusCode: http.StatusOK,
		Body:       io.NopCloser(bytes.NewBufferString("ahoj")),
		Header: http.Header{
			"Content-Type": []string{"application/json"},
		},
	}, nil
}
31 |
// testHTTPHeaderRoundTripperMapToHeaders converts a plain string map into an
// http.Header; Header.Set canonicalizes the header names on the way in.
func testHTTPHeaderRoundTripperMapToHeaders(data map[string]string) http.Header {
	headers := http.Header{}
	for name, value := range data {
		headers.Set(name, value)
	}
	return headers
}
39 |
40 | func Test_httpHeadersRoundTripper_RoundTrip(t *testing.T) {
41 | tests := []struct {
42 | name string
43 | initialHeaders map[string]string
44 | appendedHeaders map[string]string
45 | expectedHeaders map[string]string
46 | }{
47 | {
48 | name: "have header and append header",
49 | initialHeaders: map[string]string{"header1": "value1"},
50 | appendedHeaders: map[string]string{"appendedHeader": "appendedHeaderValue"},
51 | expectedHeaders: map[string]string{"appendedHeader": "appendedHeaderValue", "header1": "value1"},
52 | },
53 | {
54 | name: "only append header",
55 | initialHeaders: map[string]string{},
56 | appendedHeaders: map[string]string{"appendedHeader": "appendedHeaderValue"},
57 | expectedHeaders: map[string]string{"appendedHeader": "appendedHeaderValue"},
58 | },
59 | {
60 | name: "have header and not append header",
61 | initialHeaders: map[string]string{"header1": "value1"},
62 | appendedHeaders: map[string]string{},
63 | expectedHeaders: map[string]string{"header1": "value1"},
64 | },
65 | {
66 | name: "empty headers",
67 | initialHeaders: map[string]string{},
68 | appendedHeaders: map[string]string{},
69 | expectedHeaders: map[string]string{},
70 | },
71 | {
72 | name: "append multiple headers",
73 | initialHeaders: map[string]string{},
74 | appendedHeaders: map[string]string{"appendedHeader1": "appendedHeaderValue1", "appendedHeader2": "appendedHeaderValue2"},
75 | expectedHeaders: map[string]string{"appendedHeader1": "appendedHeaderValue1", "appendedHeader2": "appendedHeaderValue2"},
76 | },
77 | {
78 | name: "overwrite header",
79 | initialHeaders: map[string]string{"header": "value"},
80 | appendedHeaders: map[string]string{"header": "newValue"},
81 | expectedHeaders: map[string]string{"header": "newValue"},
82 | },
83 | }
84 | for _, tt := range tests {
85 | t.Run(tt.name, func(t *testing.T) {
86 | rt := httpHeadersRoundTripper{
87 | headers: tt.appendedHeaders,
88 | roudTripper: &testHTTPHeaderRoundTripper{
89 | expectedHeaders: testHTTPHeaderRoundTripperMapToHeaders(tt.expectedHeaders),
90 | t: t,
91 | },
92 | }
93 |
94 | c, err := api.NewClient(api.Config{
95 | Address: "http://fake-address",
96 | RoundTripper: rt,
97 | })
98 | if err != nil {
99 | t.Fatal(err)
100 | }
101 | r := &http.Request{
102 | URL: &url.URL{Scheme: "http", Host: "fake-host", Path: "/"},
103 | Header: testHTTPHeaderRoundTripperMapToHeaders(tt.initialHeaders),
104 | }
105 | if _, _, err = c.Do(context.Background(), r); err != nil {
106 | t.Fatal(err)
107 | }
108 | })
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/pkg/prometheus_ingester/query_executor_test.go:
--------------------------------------------------------------------------------
1 | package prometheus_ingester
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/prometheus/common/model"
8 | "github.com/stretchr/testify/assert"
9 | )
10 |
11 | func Test_queryResult_applyStaleness(t *testing.T) {
12 | ts := model.Time(0)
13 | fingerprint := model.Fingerprint(0)
14 | tests := []struct {
15 | name string
16 | input queryResult
17 | staleness time.Duration
18 | ts time.Time
19 | expectedMetrics int
20 | }{
21 | {
22 | name: "keep recent samples",
23 | ts: ts.Time().Add(time.Minute),
24 | staleness: defaultStaleness,
25 | input: queryResult{
26 | timestamp: ts.Time(),
27 | metrics: map[model.Fingerprint]model.SamplePair{
28 | fingerprint: {
29 | Timestamp: ts,
30 | Value: 0,
31 | },
32 | },
33 | },
34 | expectedMetrics: 1,
35 | },
36 | {
37 | name: "drop outdated samples",
38 | ts: ts.Time().Add(time.Minute + defaultStaleness),
39 | staleness: defaultStaleness,
40 | input: queryResult{
41 | timestamp: ts.Time(),
42 | metrics: map[model.Fingerprint]model.SamplePair{
43 | fingerprint: {
44 | Timestamp: ts,
45 | Value: 0,
46 | },
47 | },
48 | },
49 | expectedMetrics: 0,
50 | },
51 | }
52 | for _, tt := range tests {
53 | t.Run(tt.name, func(t *testing.T) {
54 | tt.input.dropStaleResults(tt.staleness, tt.ts)
55 | assert.Equalf(t, tt.expectedMetrics, len(tt.input.metrics), "unexpected number of metrics in result: %s", tt.input.metrics)
56 | })
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/pkg/relabel/relabel.go:
--------------------------------------------------------------------------------
1 | package relabel
2 |
3 | import (
4 | "fmt"
5 | "time"
6 |
7 | "github.com/prometheus/client_golang/prometheus"
8 | "github.com/seznam/slo-exporter/pkg/event"
9 | "github.com/seznam/slo-exporter/pkg/pipeline"
10 | "github.com/seznam/slo-exporter/pkg/stringmap"
11 | "github.com/spf13/viper"
12 | "gopkg.in/yaml.v2"
13 |
14 | "github.com/prometheus/prometheus/pkg/relabel"
15 | "github.com/sirupsen/logrus"
16 | )
17 |
// droppedEventsTotal counts events discarded by a relabel rule (relabelEvent
// returning nil); it is registered via RegisterMetrics.
var droppedEventsTotal = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "dropped_events_total",
	Help: "Total number of dropped events.",
})
22 |
23 | func NewFromViper(viperConfig *viper.Viper, logger logrus.FieldLogger) (*EventRelabelManager, error) {
24 | // Viper unmarshal the nested structure to nested structure of interface{} types.
25 | // Prometheus relabel uses classic YAML unmarshalling so we marshall the structure to YAML again and then let
26 | // Prometheus code validate it and unmarshall it.
27 | var relabelConf []relabel.Config
28 | marshalledConfig, err := yaml.Marshal(viperConfig.Get("EventRelabelConfigs"))
29 | if err != nil {
30 | return nil, fmt.Errorf("failed to load configuration: %w", err)
31 | }
32 | if err := yaml.UnmarshalStrict(marshalledConfig, &relabelConf); err != nil {
33 | return nil, fmt.Errorf("failed to load configuration: %w", err)
34 | }
35 | return NewFromConfig(relabelConf, logger)
36 | }
37 |
// NewFromConfig returns an EventRelabelManager applying the given Prometheus
// relabel rules to the metadata of processed events.
// (The previous comment mentioning requestNormalizer/RequestEvent was a
// copy-paste from another module.)
func NewFromConfig(relabelConfig []relabel.Config, logger logrus.FieldLogger) (*EventRelabelManager, error) {
	relabelManager := EventRelabelManager{
		relabelConfig: relabelConfig,
		outputChannel: make(chan *event.Raw),
		logger:        logger,
	}
	return &relabelManager, nil
}
47 |
// EventRelabelManager is a pipeline processor module applying Prometheus
// relabel rules to the metadata of raw events. done is set once the input
// channel has been closed and processing has finished.
type EventRelabelManager struct {
	relabelConfig []relabel.Config
	observer      pipeline.EventProcessingDurationObserver
	inputChannel  chan *event.Raw
	outputChannel chan *event.Raw
	done          bool
	logger        logrus.FieldLogger
}
56 |
// String returns the module name.
func (r *EventRelabelManager) String() string {
	return "relabel"
}

// Done reports whether the module has finished processing (input closed).
func (r *EventRelabelManager) Done() bool {
	return r.done
}

// RegisterMetrics registers the module's metrics into the wrapped registry.
func (r *EventRelabelManager) RegisterMetrics(_, wrappedRegistry prometheus.Registerer) error {
	return wrappedRegistry.Register(droppedEventsTotal)
}

// SetInputChannel sets the channel the module consumes raw events from.
func (r *EventRelabelManager) SetInputChannel(channel chan *event.Raw) {
	r.inputChannel = channel
}

// OutputChannel returns the channel relabeled events are emitted to.
func (r *EventRelabelManager) OutputChannel() chan *event.Raw {
	return r.outputChannel
}

// Stop is a no-op; the module stops when its input channel is closed.
func (r *EventRelabelManager) Stop() {}

// RegisterEventProcessingDurationObserver sets the observer that receives
// per-event processing durations.
func (r *EventRelabelManager) RegisterEventProcessingDurationObserver(observer pipeline.EventProcessingDurationObserver) {
	r.observer = observer
}

// observeDuration reports the time elapsed since start to the registered
// observer, if any.
func (r *EventRelabelManager) observeDuration(start time.Time) {
	if r.observer != nil {
		r.observer.Observe(time.Since(start).Seconds())
	}
}
88 |
89 | // relabelEvent applies the relabel configs on the event metadata.
90 | // If event is about to be dropped, nil is returned.
91 | func (r *EventRelabelManager) relabelEvent(e *event.Raw) *event.Raw {
92 | newLabels := e.Metadata.AsPrometheusLabels()
93 | for _, relabelConfigRule := range r.relabelConfig {
94 | newLabels = relabel.Process(newLabels, &relabelConfigRule)
95 | if newLabels == nil {
96 | return nil
97 | }
98 | }
99 | e.Metadata = stringmap.NewFromLabels(newLabels)
100 | return e
101 | }
102 |
// Run starts a goroutine consuming events from the input channel, applying
// the relabel rules to each and forwarding the result to the output channel.
// Dropped events are counted in droppedEventsTotal and not forwarded. When
// the input channel is closed, the output channel is closed and the module
// is marked done.
func (r *EventRelabelManager) Run() {
	go func() {
		// Closing the output channel and flagging done must happen only after
		// the processing loop has drained the input, hence the defer.
		defer func() {
			close(r.outputChannel)
			r.done = true
		}()
		for newEvent := range r.inputChannel {
			start := time.Now()
			relabeledEvent := r.relabelEvent(newEvent)
			if relabeledEvent == nil {
				r.logger.WithField("event", newEvent).Debug("dropping event")
				droppedEventsTotal.Inc()
				continue
			}
			r.logger.WithField("event", newEvent).Debug("relabeled event")
			r.outputChannel <- relabeledEvent
			r.observeDuration(start)
		}
		r.logger.Info("input channel closed, finishing")
	}()
}
125 |
--------------------------------------------------------------------------------
/pkg/relabel/relabel_test.go:
--------------------------------------------------------------------------------
1 | package relabel
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/prometheus/prometheus/pkg/relabel"
8 | "github.com/seznam/slo-exporter/pkg/event"
9 | "github.com/sirupsen/logrus"
10 | "github.com/stretchr/testify/assert"
11 | "gopkg.in/yaml.v2"
12 |
13 | "github.com/spf13/viper"
14 | )
15 |
// testCase pairs an input event with the event expected after relabeling;
// a nil outputEvent means the event is expected to be dropped.
type testCase struct {
	name        string
	inputEvent  *event.Raw
	outputEvent *event.Raw
}
21 |
// testCases covers the relabel actions exercised by TestRelabel_Run:
// drop, labeldrop, replace (explicit and implicit) and labelmap.
var testCases = []testCase{
	{
		name:        "relabel event with empty metadata",
		inputEvent:  &event.Raw{Metadata: map[string]string{}},
		outputEvent: &event.Raw{Metadata: map[string]string{}},
	},
	{
		name:        "relabel event with simple metadata that will not be modified",
		inputEvent:  &event.Raw{Metadata: map[string]string{"foo": "bar"}},
		outputEvent: &event.Raw{Metadata: map[string]string{"foo": "bar"}},
	},
	{
		name:        "relabel event which should be dropped",
		inputEvent:  &event.Raw{Metadata: map[string]string{"to_be_dropped": "true"}},
		outputEvent: nil,
	},
	{
		name:        "relabel event where label should be dropped",
		inputEvent:  &event.Raw{Metadata: map[string]string{"foo": "bar", "label_to_be_dropped": "xxx"}},
		outputEvent: &event.Raw{Metadata: map[string]string{"foo": "bar"}},
	},
	{
		name:        "relabel event where get parameter of url is parsed out to new label",
		inputEvent:  &event.Raw{Metadata: map[string]string{"url": "http://foo.bar:8080?operationName=test-operation"}},
		outputEvent: &event.Raw{Metadata: map[string]string{"url": "http://foo.bar:8080?operationName=test-operation", "operation_name": "test-operation"}},
	},
	{
		name:        "relabel event to add all labels with prefix http_ as new labels without the prefix",
		inputEvent:  &event.Raw{Metadata: map[string]string{"http_status": "200", "http_method": "POST"}},
		outputEvent: &event.Raw{Metadata: map[string]string{"http_status": "200", "http_method": "POST", "status": "200", "method": "POST"}},
	},
}
54 |
55 | func TestRelabel_Run(t *testing.T) {
56 | configYaml := `
57 | - source_labels: ["to_be_dropped"]
58 | regex: "true"
59 | action: drop
60 | - regex: "label_to_be_dropped"
61 | action: labeldrop
62 | - source_labels: ["url"]
63 | regex: ".*operationName=(.*)(&.*)?$"
64 | target_label: operation_name
65 | replacement: "$1"
66 | - source_labels: ["url"]
67 | regex: ".*operationName=(.*)(&.*)?$"
68 | action: replace
69 | target_label: operation_name
70 | replacement: "$1"
71 | - action: labelmap
72 | regex: "http_(.*)"
73 | replacement: "$1"
74 | `
75 | var config []relabel.Config
76 | err := yaml.UnmarshalStrict([]byte(configYaml), &config)
77 | if err != nil {
78 | t.Fatal(err)
79 | }
80 | mgr, err := NewFromConfig(config, logrus.New())
81 | if err != nil {
82 | t.Fatal(err)
83 | }
84 |
85 | for _, testCase := range testCases {
86 | t.Run(testCase.name, func(t *testing.T) {
87 | assert.Equal(t, testCase.outputEvent, mgr.relabelEvent(testCase.inputEvent))
88 | })
89 | }
90 | }
91 |
92 | func TestRlabel_NewFromViper(t *testing.T) {
93 | t.Run("returns error when yaml config contains unknown keys", func(t *testing.T) {
94 | config := []byte(`
95 | eventRelabelConfigs:
96 | eventRelabelConfigs:
97 | - source_labels: ["url"]
98 | regexP: ".*operationName=(.*)(&.*)?$"
99 | target_label: operation_name
100 | replacement: "$1" `)
101 | viper.SetConfigType("yaml")
102 | err := viper.ReadConfig(bytes.NewBuffer(config))
103 | assert.Nilf(t, err, "Unexpected error occurred: %s", err)
104 | vc := viper.Sub("EventRelabelConfigs")
105 | _, err = NewFromViper(vc, logrus.New())
106 | assert.NotNilf(t, err, "Expected error but no one occurred")
107 | })
108 | }
109 |
--------------------------------------------------------------------------------
/pkg/slo_event_producer/config.go:
--------------------------------------------------------------------------------
1 | package slo_event_producer
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/seznam/slo-exporter/pkg/stringmap"
8 | "gopkg.in/yaml.v2"
9 | )
10 |
// sloMatcher holds regular expressions matched against the SLO
// classification (domain, class, app) of an event.
type sloMatcher struct {
	DomainRegexp string `yaml:"domain"`
	ClassRegexp  string `yaml:"class"`
	AppRegexp    string `yaml:"app"`
}

// operatorOptions configures a single condition: the named operator is
// applied to the metadata value under Key, compared against Value.
type operatorOptions struct {
	Operator string `yaml:"operator"`
	Key      string `yaml:"key"`
	Value    string `yaml:"value"`
}

// ruleOptions is the configuration of a single SLO event rule: which events
// it matches, when they count as failures, and metadata to attach.
type ruleOptions struct {
	MetadataMatcherConditionsOptions []operatorOptions   `yaml:"metadata_matcher"`
	SloMatcher                       sloMatcher          `yaml:"slo_matcher"`
	FailureConditionsOptions         []operatorOptions   `yaml:"failure_conditions"`
	AdditionalMetadata               stringmap.StringMap `yaml:"additional_metadata,omitempty"`
}

// rulesConfig is the root of the SLO rules configuration file.
type rulesConfig struct {
	Rules []ruleOptions `yaml:"rules"`
}
33 |
34 | func (rc *rulesConfig) loadFromFile(path string) error {
35 | yamlFile, err := os.ReadFile(path)
36 | if err != nil {
37 | return fmt.Errorf("failed to load configuration file: %w", err)
38 | }
39 | err = yaml.UnmarshalStrict(yamlFile, rc)
40 | if err != nil {
41 | return fmt.Errorf("failed to unmarshall configuration file: %w", err)
42 | }
43 | return nil
44 | }
45 |
--------------------------------------------------------------------------------
/pkg/slo_event_producer/config_test.go:
--------------------------------------------------------------------------------
1 | package slo_event_producer
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/seznam/slo-exporter/pkg/stringmap"
7 | "github.com/stretchr/testify/assert"
8 | )
9 |
// configTestCase describes one loadFromFile scenario: the rules file to
// load, the configuration expected to be parsed from it, and whether an
// error is expected instead.
type configTestCase struct {
	name string
	path string
	expectedConfig rulesConfig
	expectedError bool
}
16 |
17 | func TestConfig_loadFromFile(t *testing.T) {
18 | testCases := []configTestCase{
19 | {
20 | name: "slo rules file with valid syntax",
21 | path: "testdata/slo_rules_valid.yaml.golden",
22 | expectedConfig: rulesConfig{Rules: []ruleOptions{
23 | {
24 | SloMatcher: sloMatcher{DomainRegexp: "domain"},
25 | FailureConditionsOptions: []operatorOptions{
26 | {
27 | Operator: "numberIsHigherThan", Key: "statusCode", Value: "500",
28 | },
29 | },
30 | AdditionalMetadata: stringmap.StringMap{"slo_type": "availability"},
31 | },
32 | }},
33 | expectedError: false,
34 | },
35 | {
36 | name: "slo_rules file with invalid syntax",
37 | path: "testdata/slo_rules_invalid.yaml.golden",
38 | expectedConfig: rulesConfig{},
39 | expectedError: true,
40 | },
41 | {
42 | name: "invalid path",
43 | path: "?????",
44 | expectedConfig: rulesConfig{},
45 | expectedError: true,
46 | },
47 | }
48 |
49 | for _, c := range testCases {
50 | t.Run(
51 | c.name,
52 | func(t *testing.T) {
53 | var config rulesConfig
54 | err := config.loadFromFile(c.path)
55 | if c.expectedError {
56 | assert.Error(t, err)
57 | return
58 | }
59 | assert.Equal(t, c.expectedConfig, config, "failed config test for path %s", c.path)
60 | },
61 | )
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/pkg/slo_event_producer/testdata/slo_rules_invalid.yaml.golden:
--------------------------------------------------------------------------------
1 | rules:
2 | - failure_condddddddditions:
3 | - operator: numberIsHigherThan
4 | key: "statusCode"
5 | value: 500
6 | hahahahaha:
7 |
--------------------------------------------------------------------------------
/pkg/slo_event_producer/testdata/slo_rules_valid.yaml.golden:
--------------------------------------------------------------------------------
1 | rules:
2 | - slo_matcher:
3 | domain: domain
4 | failure_conditions:
5 | - operator: numberIsHigherThan
6 | key: "statusCode"
7 | value: 500
8 | additional_metadata:
9 | slo_type: availability
10 |
--------------------------------------------------------------------------------
/pkg/storage/capped_container_test.go:
--------------------------------------------------------------------------------
1 | package storage
2 |
3 | import (
4 | "fmt"
5 | "reflect"
6 | "testing"
7 |
8 | "github.com/stretchr/testify/assert"
9 | )
10 |
11 | func Test_Container_Capacity(t *testing.T) {
12 | tests := []struct {
13 | name string
14 | capacity int
15 | }{
16 | {name: "check capacity", capacity: 0},
17 | {name: "check capacity", capacity: 3},
18 | {name: "check capacity", capacity: 100},
19 | }
20 | for _, tt := range tests {
21 | containers := []CappedContainer{
22 | NewInMemoryCappedContainer(tt.capacity),
23 | }
24 | for _, c := range containers {
25 | t.Run(fmt.Sprintf(" | %s | %s of %d ", reflect.TypeOf(c).String(), tt.name, tt.capacity), func(t *testing.T) {
26 | assert.Equal(t, c.Capacity(), tt.capacity, fmt.Sprintf("Expected container capacity: %d, but got: %d", tt.capacity, c.Len()))
27 | })
28 | }
29 | }
30 | }
31 |
32 | func Test_CappedContainer_Capping(t *testing.T) {
33 | tests := []struct {
34 | name string
35 | capacity int
36 | itemsToAdd []interface{}
37 | expectedItems []interface{}
38 | }{
39 | {name: "container with negative capacity", capacity: -1, itemsToAdd: []interface{}{1, 2, 3}, expectedItems: []interface{}{}},
40 | {name: "container with zero capacity", capacity: 0, itemsToAdd: []interface{}{1, 2, 3}, expectedItems: []interface{}{}},
41 | {name: "container with no capacity limit", capacity: 100, itemsToAdd: []interface{}{1, 2, 3}, expectedItems: []interface{}{1, 2, 3}},
42 | {name: "container with limited capacity", capacity: 3, itemsToAdd: []interface{}{1, 2, 3, 4, 5}, expectedItems: []interface{}{3, 4, 5}},
43 | }
44 | for _, tt := range tests {
45 | containers := []Container{
46 | NewInMemoryCappedContainer(tt.capacity),
47 | }
48 | for _, c := range containers {
49 | t.Run(fmt.Sprintf(" | %s | %s", reflect.TypeOf(c).String(), tt.name), func(t *testing.T) {
50 | for _, item := range tt.itemsToAdd {
51 | c.Add(item)
52 | }
53 | var streamedItems []interface{}
54 | for i := range c.Stream() {
55 | streamedItems = append(streamedItems, i)
56 | }
57 | assert.ElementsMatch(t, streamedItems, tt.expectedItems, fmt.Sprintf("Expected streamed items: %s, but got: %s", tt.expectedItems, streamedItems))
58 | })
59 | }
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/pkg/storage/container_test.go:
--------------------------------------------------------------------------------
1 | package storage
2 |
3 | import (
4 | "fmt"
5 | "reflect"
6 | "testing"
7 |
8 | "github.com/stretchr/testify/assert"
9 | )
10 |
11 | func Test_Container_Add(t *testing.T) {
12 | tests := []struct {
13 | name string
14 | item interface{}
15 | }{
16 | {name: "add number", item: 1},
17 | {name: "add string", item: "foo"},
18 | {name: "add struct", item: struct{}{}},
19 | }
20 | for _, tt := range tests {
21 | containerCapacity := 1
22 | containers := []Container{
23 | NewInMemoryCappedContainer(containerCapacity),
24 | }
25 | for _, c := range containers {
26 | t.Run(fmt.Sprintf(" | %s | %s", reflect.TypeOf(c).String(), tt.name), func(_ *testing.T) {
27 | c.Add(tt.item)
28 | })
29 | }
30 | }
31 | }
32 |
33 | func Test_Container_Len(t *testing.T) {
34 | tests := []struct {
35 | name string
36 | numberOfItems int
37 | }{
38 | {name: "check length", numberOfItems: 0},
39 | {name: "check length", numberOfItems: 3},
40 | {name: "check length", numberOfItems: 100},
41 | }
42 | for _, tt := range tests {
43 | containers := []Container{
44 | NewInMemoryCappedContainer(tt.numberOfItems),
45 | }
46 | for _, c := range containers {
47 | t.Run(fmt.Sprintf(" | %s | %s with %d items", reflect.TypeOf(c).String(), tt.name, tt.numberOfItems), func(t *testing.T) {
48 | for i := 0; i < tt.numberOfItems; i++ {
49 | c.Add(struct{}{})
50 | }
51 | assert.Equal(t, c.Len(), tt.numberOfItems, fmt.Sprintf("Expected container length: %d, but got: %d", tt.numberOfItems, c.Len()))
52 | })
53 | }
54 | }
55 | }
56 |
57 | func Test_Container_Stream(t *testing.T) {
58 | tests := []struct {
59 | name string
60 | items []interface{}
61 | }{
62 | {name: "stream numbers", items: []interface{}{1, 2, 3}},
63 | {name: "stream strings", items: []interface{}{"a", "b", "c"}},
64 | {name: "stream structs", items: []interface{}{struct{}{}, struct{}{}, struct{}{}}},
65 | }
66 | for _, tt := range tests {
67 | containers := []Container{
68 | NewInMemoryCappedContainer(len(tt.items)),
69 | }
70 | for _, c := range containers {
71 | t.Run(fmt.Sprintf(" | %s | %s", reflect.TypeOf(c).String(), tt.name), func(t *testing.T) {
72 | for _, item := range tt.items {
73 | c.Add(item)
74 | }
75 | assert.Equal(t, c.Len(), len(tt.items))
76 | var streamedItems []interface{}
77 | for i := range c.Stream() {
78 | streamedItems = append(streamedItems, i)
79 | }
80 | assert.ElementsMatch(t, streamedItems, tt.items, fmt.Sprintf("Expected streamed items: %s, but got: %s", tt.items, streamedItems))
81 | })
82 | }
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/pkg/storage/in_memory.go:
--------------------------------------------------------------------------------
1 | package storage
2 |
3 | import (
4 | "container/list"
5 | "sync"
6 | )
7 |
8 | // NewInMemoryCappedContainer create new in-memory capped container with capacity limit.
9 | func NewInMemoryCappedContainer(capacity int) CappedContainer {
10 | return &inMemoryCappedContainer{
11 | list: list.New(),
12 | capacity: capacity,
13 | }
14 | }
15 |
// inMemoryCappedContainer is a CappedContainer backed by a doubly linked
// list, guarded by an RWMutex for concurrent use.
type inMemoryCappedContainer struct {
	// list holds the items, newest at the front.
	list *list.List
	// capacity is the maximum number of retained items; immutable after construction.
	capacity int
	// lock guards list.
	lock sync.RWMutex
}
21 |
22 | // Len returns current size of container.
23 | func (h *inMemoryCappedContainer) Len() int {
24 | h.lock.RLock()
25 | defer h.lock.RUnlock()
26 | return h.list.Len()
27 | }
28 |
// Capacity returns maximum limit of the capped container.
// The capacity field is set once at construction and never mutated, so no
// locking is needed here.
func (h *inMemoryCappedContainer) Capacity() int {
	return h.capacity
}
33 |
34 | // Add adds new item to container.
35 | func (h *inMemoryCappedContainer) Add(record interface{}) {
36 | h.lock.Lock()
37 | defer h.lock.Unlock()
38 | h.list.PushFront(record)
39 |
40 | // Drop items exceeding capacity limit.
41 | if h.list.Len() > h.capacity {
42 | h.list.Remove(h.list.Back())
43 | }
44 | }
45 |
// Stream writes items to returned channel.
// Items are sent newest-first and the channel is closed after the last one.
// NOTE(review): the goroutine holds the read lock until the consumer drains
// the unbuffered channel; an abandoned Stream leaks the goroutine and blocks
// all writers — confirm every caller consumes the channel fully.
func (h *inMemoryCappedContainer) Stream() <-chan interface{} {
	stream := make(chan interface{})
	go func() {
		h.lock.RLock()
		defer h.lock.RUnlock()
		for e := h.list.Front(); e != nil; e = e.Next() {
			stream <- e.Value
		}
		close(stream)
	}()
	return stream
}
59 |
--------------------------------------------------------------------------------
/pkg/storage/interfaces.go:
--------------------------------------------------------------------------------
1 | package storage
2 |
// Container is a generic collection of arbitrary items.
type Container interface {
	// Add inserts a new item into the container.
	Add(item interface{})
	// Stream returns a channel on which all stored items are sent.
	Stream() <-chan interface{}
	// Len returns the current number of stored items.
	Len() int
}
8 |
// CappedContainer is a Container with a fixed upper bound on the number of
// items it retains.
type CappedContainer interface {
	Container
	// Capacity returns the maximum number of retained items.
	Capacity() int
}
13 |
--------------------------------------------------------------------------------
/pkg/stringmap/stringmap_benchmark_test.go:
--------------------------------------------------------------------------------
1 | package stringmap
2 |
3 | import (
4 | "strconv"
5 | "strings"
6 | "testing"
7 | )
8 |
// benchmarkCase pairs a human-readable name with the StringMap instance the
// benchmarked operations run on.
type benchmarkCase struct {
	name string
	data StringMap
}
13 |
14 | func generateStringMap(keysCount, stringLen int) StringMap {
15 | newMap := make(StringMap, keysCount)
16 | for i := 0; i < keysCount; i++ {
17 | key := strings.Repeat(strconv.Itoa(i), stringLen)
18 | newMap[key] = key
19 | }
20 | return newMap
21 | }
22 |
23 | func BenchmarkStringMap(b *testing.B) {
24 | testCases := []benchmarkCase{
25 | {name: "small map/small keys", data: generateStringMap(3, 3)},
26 | {name: "small map/large keys", data: generateStringMap(3, 1000)},
27 | {name: "large map/small keys", data: generateStringMap(1000, 3)},
28 | {name: "large map/large keys", data: generateStringMap(1000, 1000)},
29 | }
30 | for _, tc := range testCases {
31 | b.Run("StringMap.Copy/"+tc.name, func(b *testing.B) {
32 | for n := 0; n < b.N; n++ {
33 | tc.data.Copy()
34 | }
35 | })
36 | b.Run("StringMap.Merge on : "+tc.name, func(b *testing.B) {
37 | for n := 0; n < b.N; n++ {
38 | tc.data.Merge(tc.data)
39 | }
40 | })
41 | b.Run("StringMap.Keys on : "+tc.name, func(b *testing.B) {
42 | for n := 0; n < b.N; n++ {
43 | tc.data.Keys()
44 | }
45 | })
46 | b.Run("StringMap.NewWith on : "+tc.name, func(b *testing.B) {
47 | for n := 0; n < b.N; n++ {
48 | tc.data.NewWith("foo", "bar")
49 | }
50 | })
51 | b.Run("StringMap.Select on : "+tc.name, func(b *testing.B) {
52 | for n := 0; n < b.N; n++ {
53 | tc.data.Select([]string{"a", "b", "c", "d", "e", "f", "g", "h"})
54 | }
55 | })
56 | b.Run("StringMap.SortedKeys on : "+tc.name, func(b *testing.B) {
57 | for n := 0; n < b.N; n++ {
58 | tc.data.SortedKeys()
59 | }
60 | })
61 | b.Run("StringMap.Without on : "+tc.name, func(b *testing.B) {
62 | for n := 0; n < b.N; n++ {
63 | tc.data.Without([]string{"a", "b", "c", "d", "e", "f", "g", "h"})
64 | }
65 | })
66 | b.Run("StringMap.String on : "+tc.name, func(b *testing.B) {
67 | for n := 0; n < b.N; n++ {
68 | _ = tc.data.String()
69 | }
70 | })
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/prometheus/alerts/error-budget.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: slo-exporter-slo-error-budget-alerts
3 | rules:
4 |
5 | - alert: ErrorBudgetExhausted
6 | expr:
7 | slo:stable_version{enabled!="false"}
8 | * on(slo_version, slo_domain, namespace) group_right(escalate, team)
9 | slo:violation_ratio{slo_time_range="4w"}
10 | / on (slo_class,slo_domain,slo_version,slo_type,namespace) group_left ()
11 | (
12 | slo:violation_ratio_threshold - 1
13 | )
14 | +1 <= 0
15 | for: 10m
16 | labels:
17 | severity: warning
18 | alert_type: slo:error_budget_exhausted
19 | annotations:
20 | title: 'Error budget is exhausted.'
21 | description: '{{$labels.slo_type | title}} error budget for SLO domain "{{$labels.slo_domain}}" was exhausted.'
22 | playbook: howto/SLO-workflow.md
23 |
--------------------------------------------------------------------------------
/prometheus/alerts/missing_all_data.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: slo-missing-all-data
3 | rules:
4 |
5 | # This is a generic alert which fires when there are no SLO data present
6 | # and thanos-rule is having trouble evaluating rules.
7 | # This typically means that gaps in SLO data are occurring.
8 | - alert: MissingAllSloData
9 | expr: |
10 | (1 - avg_over_time(
11 | (clamp_max(sum(absent(slo:events_over_time
12 | * on(slo_version, slo_domain, namespace) group_left(escalate, team) slo:stable_version{enabled!="false"}))
13 | AND sum(increase(prometheus_rule_evaluation_failures_total[5m]) > 0),0)
14 | OR clamp_max(sum(slo:events_over_time
15 | * on(slo_version, slo_domain, namespace) group_left(escalate, team) slo:stable_version{enabled!="false"}),1))[1h:])
16 | ) == 1
17 | for: 10m
18 | labels:
19 | severity: warning
20 | alert_type: slo:missing_data
21 | annotations:
22 | title: 'Missing SLO data.'
23 | description: 'No SLO events are occurring and thanos-rule has trouble evaluating rules.'
24 | playbook: "on-call/slo-missing-data.md"
25 |
--------------------------------------------------------------------------------
/prometheus/alerts/missing_data.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: slo-missing-data
3 | rules:
4 |
5 | # Missing slo:burn_rate for enabled SLO domain which had non-zero events in the past 5 minutes
6 | - alert: MissingSloBurnRateDomain
7 | expr: |
8 | (
9 | sum(
10 | slo:stable_version{enabled="true"}
11 | ) by (slo_version, slo_domain, namespace, escalate, team)
12 | unless on (slo_domain, slo_version, namespace)
13 | (
14 | slo:burn_rate{slo_time_range='5m'}
15 | * on(slo_version, slo_domain, namespace) group_left(escalate, team)
16 | slo:stable_version{enabled="true"}
17 | )
18 | )
19 | and on(slo_version, slo_domain, namespace)
20 | sum(
21 | slo:events_over_time{slo_time_range="5m"}
22 | ) without (result, slo_type) > 0
23 | for: 5m
24 | labels:
25 | severity: critical
26 | alert_type: slo:missing_data
27 | annotations:
28 | title: 'Missing burn rate data for {{ $labels.slo_domain }}.'
29 | description: 'Burn rate probably failed to evaluate for {{ $labels.slo_domain }}.'
30 | playbook: "on-call/slo-missing-data.md"
31 |
--------------------------------------------------------------------------------
/prometheus/alerts/slo_data_corrections.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: slo-data-corrections-alert
3 | interval: 3m
4 | rules:
5 |
  # This is a generic alert which fires when an SLO data correction metric is no longer needed.
  # To make it work, please make sure your data correction rule always exposes 0 when it is not active.
8 | - alert: "Expired SLO data correction"
9 | expr: 'avg_over_time(slo:correction_window[10m]) == 0'
10 | for: 10m
11 | labels:
12 | severity: info
13 | team: sre
14 | annotations:
15 | title: "Some SLO data correction is no longer used"
16 | description: "SLO data correction for incident {{$labels.incident}} is no longer used. You can erase it."
17 | playbook: "howto/slo-data-correction.md"
18 |
19 |
20 |
--------------------------------------------------------------------------------
/prometheus/alerts/slo_exporter_alerts.yaml:
--------------------------------------------------------------------------------
1 | # This file contains an example set of alerts based on slo-exporter's application metrics.
2 | groups:
3 | - name: slo-exporter-alerts
4 | rules:
5 | - alert: SloExporterNotUp
6 | expr: (label_replace(avg_over_time(up{instance=~"slo-exporter.*"}[4m]), "pod", "$1", "instance", "(.*)") < 0.9) and on(namespace, pod) (kube_pod_status_ready{condition="true"}==1)
7 | for: 10m
8 | labels:
9 | severity: critical
10 | annotations:
11 | title: Prometheus failed to scrape SLO exporter metrics.
12 | description: Slo-exporter metrics cannot be scraped for more than 10 minutes.
13 |
14 | - alert: SloExporterNoNewLines
15 | expr: rate(slo_exporter_tailer_lines_read_total[1m]) == 0
16 | for: 10m
17 | labels:
18 | severity: warning
19 | annotations:
20 | title: No new lines has been processed in last 10m.
21 | description: SLO exporter did not process any new lines in last 10m.
22 |
23 | - alert: SloExporterFallingBehindShort
24 | expr: (slo_exporter_tailer_file_size_bytes - slo_exporter_tailer_file_offset_bytes) > 1024^2
25 | for: 10m
26 | labels:
27 | severity: warning
28 | annotations:
29 | title: Slo-exporter falling behind in processing of the tailed file.
      description: SLO exporter does not keep up with the amount of new logs, there is more than 1MB of unprocessed data for more than 10m.
31 |
32 | - alert: SloExporterFallingBehindLong
33 | expr: (slo_exporter_tailer_file_size_bytes - slo_exporter_tailer_file_offset_bytes) > 1024^2
34 | for: 2h
35 | labels:
36 | severity: critical
37 | annotations:
38 | title: Long-term falling behind in processing of the tailed file.
      description: SLO exporter does not keep up with the amount of new logs, there is more than 1MB of unprocessed data for more than 2h.
40 |
41 | - alert: SloExporterMalformedLines
42 | expr: increase(slo_exporter_tailer_malformed_lines_total[10m]) > 100
43 | labels:
44 | severity: warning
45 | annotations:
46 | title: There is higher number of malformed lines.
47 | description: SLO exporter tailer failed to process {{$value}} lines in last 10m.
48 |
49 | - alert: SloExporterPrometheusIngesterUnsupportedQueryResult
50 | expr: increase(slo_exporter_prometheus_ingester_unsupported_query_result_type_total[10m]) > 10
51 | labels:
52 | severity: critical
53 | annotations:
54 | title: Prometheus ingester query returns an unsupported type.
55 | description: SLO exporter Prometheus ingester module failed to process {{$value}} results in last 10m.
56 |
57 | - alert: SloExporterPrometheusIngesterQueryFails
58 | expr: increase(slo_exporter_prometheus_ingester_query_fails_total[10m]) > 10
59 | labels:
60 | severity: critical
61 | annotations:
62 | title: Prometheus ingester fails to execute query.
63 | description: SLO exporter Prometheus ingester module failed to execute {{$value}} queries in last 10m.
64 |
65 | - alert: SloExporterEventsNotMatchingAnyRules
66 | expr: increase(slo_exporter_slo_event_producer_events_not_matching_any_rule[1m]) > 0
67 | labels:
68 | severity: warning
69 | annotations:
70 | title: Some events did not match any SLO evaluation rule.
71 | description: SLO exporter encountered {{$value}} events which did not match any SLO evaluation rules in last 1m.
72 |
73 | - alert: SloExporterEventKeyLimitExceeded
74 | expr: slo_exporter_prometheus_exporter_event_keys >= slo_exporter_prometheus_exporter_event_keys_limit
75 | labels:
76 | severity: warning
77 | annotations:
78 | title: The limit of event keys in Prometheus metrics exceeded.
79 | description: SLO exporter hit the limit of maximum unique event keys exposed in Prometheus metrics, from now on it replaces any new encountered with placeholder.
80 |
81 | - alert: SloExporterUnclassifiedEvents
82 | expr: sum(slo_exporter_dynamic_classifier_events_processed_total{result="unclassified", status_code!~"[45].."}) by (namespace)
83 | labels:
84 | severity: warning
85 | annotations:
      title: Unclassified endpoints occurred.
87 | description: SLO exporter is unable to classify some events.
--------------------------------------------------------------------------------
/prometheus/recording_rules/burn-rate.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: slo-violation-ratio-and-burn-rate
3 | interval: 1m
4 | rules:
5 | - record: slo:violation_ratio
6 | expr: |
7 | (
8 | # Produce zero instead of NaN.
9 | 0 == sum by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) (
10 | slo:events_over_time
11 | )
12 | )
13 | or on(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version)
14 | # Otherwise, compute the actual violation ratio, if possible
15 | (
16 | sum by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) (
17 | slo:events_over_time{result="fail"}
18 | )
19 | /
20 | sum by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) (
21 | slo:events_over_time
22 | )
23 | )
24 | or on(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version)
25 | # Otherwise, if no failed events are present, return 0 violation_ratio for the given set of labels
26 | (
27 | 0 * count by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) (
28 | slo:events_over_time
29 | )
30 | )
31 | - record: slo:burn_rate
32 | expr:
33 | slo:violation_ratio
34 | / on(slo_class, slo_domain, slo_version, slo_type, namespace) group_left()
35 | (1 - slo:violation_ratio_threshold)
36 |
--------------------------------------------------------------------------------
/prometheus/recording_rules/error-budget.yaml:
--------------------------------------------------------------------------------
1 | groups:
  # Used for simple visualizations in Grafana.
3 | - name: slo-error-budget
4 | interval: 1m
5 | rules:
6 | - record: slo:error_budget
7 | expr: |
8 | slo:violation_ratio{slo_time_range="4w"}
9 | * on (slo_domain,slo_version, namespace) group_left()
10 | max(slo:stable_version) by (slo_class,slo_domain,slo_version, slo_type, namespace)
11 | / on (slo_class,slo_domain,slo_version, slo_type, namespace) group_left ()
12 | (slo:violation_ratio_threshold - 1)
13 | + 1
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/prometheus/recording_rules/events-over-time.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: slo-events-over-time-4w--interval-3m
3 | interval: 3m
4 | rules:
5 | - record: slo:events_over_time
6 | expr: last_over_time(slo:events_over_time:sparse{slo_time_range="4w"}[30m])
7 |
8 | #
9 | # 4w aggregation may be quite resource-intensive for the monitoring stack, so we evaluate it just once every 15 minutes.
10 | # This causes slo:events_over_time:sparse timeseries to be discontinuous making it most probably useless to be used directly unless last_over_time is applied.
11 | #
12 | # Sharded into 2 queries based on namespace to lower computational intensity
13 | - name: slo-events-over-time-4w--interval-15m-production
14 | interval: 15m
15 | rules:
16 | - record: slo:events_over_time:sparse
17 | labels:
18 | slo_time_range: 4w
19 | expr:
20 | sum(
21 | increase(slo_domain_slo_class:slo_events_total{namespace="production"}[4w])
22 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
23 | - name: slo-events-over-time-4w--interval-15m-not-production
24 | interval: 15m
25 | rules:
26 | - record: slo:events_over_time:sparse
27 | labels:
28 | slo_time_range: 4w
29 | expr:
30 | sum(
31 | increase(slo_domain_slo_class:slo_events_total{namespace!="production"}[4w])
32 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
33 |
34 |
35 | - name: slo-events-over-time-3d--interval-3m
36 | interval: 3m
37 | rules:
38 | - record: slo:events_over_time
39 | labels:
40 | slo_time_range: 3d
41 | expr:
42 | sum(
43 | increase(slo_domain_slo_class:slo_events_total[3d])
44 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
45 |
46 | - name: slo-events-over-time-1d--interval-3m
47 | interval: 3m
48 | rules:
49 | - record: slo:events_over_time
50 | labels:
51 | slo_time_range: 1d
52 | expr:
53 | sum(
54 | increase(slo_domain_slo_class:slo_events_total[1d])
55 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
56 |
57 | - name: slo-events-over-time-6h--interval-3m
58 | interval: 3m
59 | rules:
60 | - record: slo:events_over_time
61 | labels:
62 | slo_time_range: 6h
63 | expr:
64 | sum(
65 | increase(slo_domain_slo_class:slo_events_total[6h])
66 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
67 |
68 | - name: slo-events-over-time-2h--interval-3m
69 | interval: 3m
70 | rules:
71 | - record: slo:events_over_time
72 | labels:
73 | slo_time_range: 2h
74 | expr:
75 | sum(
76 | increase(slo_domain_slo_class:slo_events_total[2h])
77 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
78 |
79 | - name: slo-events-over-time--interval-1m
80 | interval: 1m
81 | rules:
82 | - record: slo:events_over_time
83 | labels:
84 | slo_time_range: 1h
85 | expr:
86 | sum(
87 | increase(slo_domain_slo_class:slo_events_total[1h])
88 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
89 | - record: slo:events_over_time
90 | labels:
91 | slo_time_range: 30m
92 | expr:
93 | sum(
94 | increase(slo_domain_slo_class:slo_events_total[30m])
95 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
96 | - record: slo:events_over_time
97 | labels:
98 | slo_time_range: 5m
99 | expr:
100 | sum(
101 | increase(slo_domain_slo_class:slo_events_total[5m])
102 | ) by (slo_class, slo_domain, slo_version, slo_type, result, namespace)
103 |
--------------------------------------------------------------------------------
/prometheus/recording_rules/rate-coefficient.yaml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: slo-events-rate-coefficient
3 | # every slo:events_rate_coefficient is multiplied by a constant which normalizes the difference of time_ranges
4 | # it is needed because slo:events_over_time is computed using increase rather than rate (in which case this wouldn't be needed)
5 | interval: 3m
6 | rules:
7 | - record: slo:events_rate_coefficient
8 | expr: |
9 | (sum(slo:events_over_time{slo_time_range="1h"}) without (result) * 24*28 )
10 | / on(slo_version, slo_domain, slo_class, slo_type, namespace) group_left()
11 | sum(slo:events_over_time{slo_time_range="4w"}) without (result)
12 | labels:
13 | slo_time_range: 1h
14 |
15 | - record: slo:events_rate_coefficient
16 | expr: |
17 | (sum(slo:events_over_time{slo_time_range="6h"}) without (result) * 4*28 )
18 | / on(slo_version, slo_domain, slo_class, slo_type, namespace) group_left()
19 | sum(slo:events_over_time{slo_time_range="4w"}) without (result)
20 | labels:
21 | slo_time_range: 6h
22 |
23 | - record: slo:events_rate_coefficient
24 | expr: |
25 | (sum(slo:events_over_time{slo_time_range="1d"}) without (result) * 28 )
26 | / on(slo_version, slo_domain, slo_class, slo_type, namespace) group_left()
27 | sum(slo:events_over_time{slo_time_range="4w"}) without (result)
28 | labels:
29 | slo_time_range: 1d
30 |
31 | - record: slo:events_rate_coefficient
32 | expr: |
33 | (sum(slo:events_over_time{slo_time_range="3d"}) without (result) * 28/3 )
34 | / on(slo_version, slo_domain, slo_class, slo_type, namespace) group_left()
35 | sum(slo:events_over_time{slo_time_range="4w"}) without (result)
36 | labels:
37 | slo_time_range: 3d
38 |
--------------------------------------------------------------------------------
/prometheus/recording_rules/slo_data_corrections.yaml:
--------------------------------------------------------------------------------
1 |
2 | # SLO data correction playbook: playbooks/blob/master/howto/slo-data-correction.md
3 |
4 | groups:
5 | - name: slo-data-corrections
6 | interval: 3m
7 | rules:
8 |
--------------------------------------------------------------------------------
/scripts/benchmark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run the Go benchmarks of every package in the module and render the
# collected CPU, memory and block profiles as PNG call graphs.
#
# Usage: scripts/benchmark.sh [output_folder]
#   output_folder - directory for *.profile and *.png files (default: "profile")

output_folder=${1:-"profile"}
# All packages of the module; vendored code is excluded. Intentionally left
# unquoted below so the shell word-splits it into individual package paths.
packages="$(go list ./... | grep -v /vendor/ | xargs echo)"
mkdir -p "$output_folder"

for package_path in $packages; do
  package_name="$(basename "$package_path")"
  cpu_profile_file="${output_folder}/${package_name}_cpu.profile"
  memory_profile_file="${output_folder}/${package_name}_memory.profile"
  block_profile_file="${output_folder}/${package_name}_block.profile"
  # -count 5 repeats each benchmark so results can be compared with benchstat.
  go test \
    --benchmem \
    -cpuprofile="$cpu_profile_file" \
    -memprofile="$memory_profile_file" \
    -blockprofile="$block_profile_file" \
    -bench=. \
    -count 5 \
    "${package_path}"
  # Profile files only exist for packages that actually contain benchmarks,
  # hence the existence checks before rendering.
  if [ -e "$cpu_profile_file" ]; then
    go tool pprof -png "$cpu_profile_file" >"${cpu_profile_file}.png"
  fi
  if [ -e "$memory_profile_file" ]; then
    go tool pprof -png "$memory_profile_file" >"${memory_profile_file}.png"
  fi
  if [ -e "$block_profile_file" ]; then
    go tool pprof -png "$block_profile_file" >"${block_profile_file}.png"
  fi
done
30 |
--------------------------------------------------------------------------------
/scripts/generate_godoc.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Generate a static HTML snapshot of this module's godoc into DST_DIR by
# running a local godoc server and mirroring it with wget.
#
# Usage: scripts/generate_godoc.sh [destination_dir]   (default: public/godoc)
CURRENT_PKG=$(go list -m)
INTERFACE=localhost:6060

DST_DIR=${1:-"public/godoc"}

# run a godoc server
# NOTE(review): `go get` for tool installation is deprecated in newer Go
# releases — `go install golang.org/x/tools/cmd/godoc@latest` may be needed.
go get golang.org/x/tools/cmd/godoc
godoc -http=$INTERFACE & DOC_PID=$!

sleep 10
# Wait for the server to start
until curl -sSf "http://$INTERFACE/pkg/$CURRENT_PKG/" > /dev/null
do
    sleep 1
done
sleep 1

# recursive fetch entire web including CSS & JS
# turn off robots check, otherwise might get blocked with details in `robots.txt` file
# only get the directories we are looking for
wget -r -p \
    -e robots=off \
    --include-directories="/lib/godoc,/pkg/$CURRENT_PKG,/src/$CURRENT_PKG" \
    --exclude-directories="/pkg/$CURRENT_PKG/vendor,/src/$CURRENT_PKG/vendor" \
    "http://$INTERFACE/pkg/$CURRENT_PKG/"

# Stop the godoc server
kill -9 $DOC_PID

# wget mirrors everything into a folder named after the host:port
# ("localhost:6060"), so move it to the requested destination.
mkdir -p "$(dirname "$DST_DIR")"
rm -rf "$DST_DIR"
mv "$INTERFACE" "$DST_DIR"
# replace relative links
find "$DST_DIR" -name "*.html" -exec sed -Ei 's/\/(lib|src|pkg)\//\/slo-exporter\/godoc\/\1\//g' {} +
37 |
--------------------------------------------------------------------------------
/test/Test_MetricsInitialization/README.md:
--------------------------------------------------------------------------------
1 | # MetricsInitialization
2 |
3 | - Test whether all expected metrics have been properly initialized on single log line.
  For all of the aggregated metrics, we check that both possible result values have been exposed and that `le` is filled according to the domain configuration file.
5 |
- There is also a single log line which is supposed to be filtered based on the provided status code. We test that by checking the total number of read lines.
7 | - The other single log line which gets processed hits the configured normalizer rule, so that endpoint name is transformed as configured.
8 |
--------------------------------------------------------------------------------
/test/Test_MetricsInitialization/classifications.csv:
--------------------------------------------------------------------------------
1 | testdomain,frontend-api,high_fast,"^(GET|POST|HEAD|PUT|DELETE):/api/v1/ppchit/rule/[0-9a-f]+$"
2 |
3 |
--------------------------------------------------------------------------------
/test/Test_MetricsInitialization/logs:
--------------------------------------------------------------------------------
1 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" uct="0.000" uht="0.127" urt="0.127" cc="frontend-api" occ="-" url="532" ourl="-"
2 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 404 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" uct="0.000" uht="0.127" urt="0.127" cc="frontend-api" occ="-" url="532" ourl="-"
3 |
--------------------------------------------------------------------------------
/test/Test_MetricsInitialization/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 | webServerListenAddress: "0.0.0.0:8080"
2 |
3 | pipeline: ["tailer", "relabel", "eventKeyGenerator", "dynamicClassifier", "statisticalClassifier", "sloEventProducer", "prometheusExporter"]
4 |
5 | modules:
6 |
7 | tailer:
8 | tailedFile: "./logs"
9 | follow: true
10 | reopen: true
11 | positionFile: ""
12 | positionPersistenceInterval: "2s"
13 | loglineParseRegexp: '^(?P[A-Fa-f0-9.:]{4,50}) \S+ \S+ \[(?P.*?)\] "\s*(?PGET|POST|HEAD|UPDATE|DELETE|PUT|CONNECT|OPTIONS|TRACE|PATCH)\s+(?P[^\s]+)\s+(?P[^\s]+)\s*" (?P\d+) \d+ "(?P.*?)" uag="(?P[^"]+)" "[^"]+" ua="[^"]+" rt="(?P\d+(\.\d+)??)"'
14 | emptyGroupRE: '^-$'
15 |
16 | relabel:
17 | eventRelabelConfigs:
18 | - source_labels: ["statusCode"]
19 | regex: "404"
20 | action: drop
21 |
22 | - source_labels: ["httpPath"]
23 | regex: "/api/v1/ppchit/rule/[0-9a-fA-F]{5,24}"
24 | action: replace
25 | target_label: "httpPath"
26 | replacement: "/api/v1/ppchit/rule/0"
27 |
28 | eventKeyGenerator:
29 | filedSeparator: ":"
30 | overrideExistingEventKey: false
31 | metadataKeys:
32 | - httpMethod
33 | - httpPath
34 |
35 | dynamicClassifier:
36 | exactMatchesCsvFiles: []
37 | regexpMatchesCsvFiles:
38 | - "./classifications.csv"
39 |
40 | statisticalClassifier:
41 | historyWindowSize: "30m"
42 | historyWeightUpdateInterval: "1m"
43 | defaultWeights:
44 | - weight: 1
45 | classification:
46 | sloDomain: "userportal"
47 | sloClass: "test1"
48 |
49 | sloEventProducer:
50 | rulesFiles:
51 | - "./slo_rules.yaml"
52 |
53 | prometheusExporter:
54 | metricName: "slo_events_total"
55 | maximumUniqueEventKeys: 1000
56 | ExceededKeyLimitPlaceholder: "cardinalityLimitExceeded"
57 | labelNames:
58 | result: "result"
59 | sloDomain: "slo_domain"
60 | sloClass: "slo_class"
61 | sloApp: "slo_app"
62 | eventKey: "event_key"
63 |
--------------------------------------------------------------------------------
/test/Test_MetricsInitialization/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 | - slo_matcher:
3 | domain: testdo.*
4 | failure_conditions:
5 | - operator: numberIsHigherThan
6 | key: statusCode
7 | value: 499
8 | additional_metadata:
9 | slo_type: availability
10 | slo_version: 1
11 |
12 | - slo_matcher:
13 | domain: testdomain
14 | class: high_fast
15 | failure_conditions:
16 | - operator: numberIsHigherThan
17 | key: requestDuration
18 | value: 8
19 | additional_metadata:
20 | slo_version: 1
21 | slo_type: latency90
22 | percentile: 90
23 | le: 8.0
24 |
25 | - slo_matcher:
26 | domain: testdomain
27 | class: high_fast
28 | failure_conditions:
29 | - operator: numberIsHigherThan
30 | key: requestDuration
31 | value: 16
32 | additional_metadata:
33 | slo_version: 1
34 | slo_type: latency99
35 | percentile: 99
36 | le: 16.0
37 |
--------------------------------------------------------------------------------
/test/Test_SloHeaders/README.md:
--------------------------------------------------------------------------------
1 | # SloHeaders
2 | - Test that SLO classification as provided within log line is correctly propagated to created SLO event
3 |
--------------------------------------------------------------------------------
/test/Test_SloHeaders/classifications.csv:
--------------------------------------------------------------------------------
1 | testdomain,frontend-api,high_fast,"^ppchit_rule$"
2 |
3 |
--------------------------------------------------------------------------------
/test/Test_SloHeaders/logs:
--------------------------------------------------------------------------------
1 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="userportal" slo-app="frontend-api" slo-class="critical" slo-endpoint="ppchit_rule" slo-result="success"
2 |
--------------------------------------------------------------------------------
/test/Test_SloHeaders/metrics:
--------------------------------------------------------------------------------
1 | # HELP slo_domain_slo_class_slo_app_event_key:slo_events_total Total number of SLO events exported with it's result and metadata.
2 | # TYPE slo_domain_slo_class_slo_app_event_key:slo_events_total counter
3 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="ppchit_rule",result="fail",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="availability",slo_version="1"} 0
4 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="ppchit_rule",result="success",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="availability",slo_version="1"} 1
5 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="ppchit_rule",le="16.0",percentile="99",result="fail",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="latency99",slo_version="1"} 0
6 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="ppchit_rule",le="16.0",percentile="99",result="success",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="latency99",slo_version="1"} 1
7 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="ppchit_rule",le="8.0",percentile="90",result="fail",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="latency90",slo_version="1"} 0
8 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="ppchit_rule",le="8.0",percentile="90",result="success",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="latency90",slo_version="1"} 1
9 | # HELP slo_exporter_tailer_lines_read_total Total number of lines tailed from the file.
10 | # TYPE slo_exporter_tailer_lines_read_total counter
11 | slo_exporter_tailer_lines_read_total 1
12 |
--------------------------------------------------------------------------------
/test/Test_SloHeaders/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 |
2 | webServerListenAddress: "0.0.0.0:8080"
3 |
4 | pipeline: ["tailer", "relabel", "eventKeyGenerator", "metadataClassifier", "dynamicClassifier", "sloEventProducer", "prometheusExporter"]
5 |
6 | modules:
7 |
8 | tailer:
9 | tailedFile: "./logs"
10 | follow: true
11 | reopen: true
12 | positionFile: ""
13 | positionPersistenceInterval: "2s"
14 | loglineParseRegexp: '^(?P[A-Fa-f0-9.:]{4,50}) \S+ \S+ \[(?P.*?)\] "\s*(?PGET|POST|HEAD|UPDATE|DELETE|PUT|CONNECT|OPTIONS|TRACE|PATCH)\s+(?P[^\s]+)\s+(?P[^\s]+)\s*" (?P\d+) \d+ "(?P.*?)" uag="(?P[^"]+)" "[^"]+" ua="[^"]+" rt="(?P\d+(\.\d+)??)" frpc-status="(?P[^"]*)" slo-domain="(?P[^"]*)" slo-app="(?P[^"]*)" slo-class="(?P[^"]*)" slo-endpoint="(?P[^"]*)" slo-result="(?P[^"]*)"'
15 | emptyGroupRE: '^-$'
16 |
17 | relabel:
18 | eventRelabelConfigs:
19 | - source_labels: ["statusCode"]
20 | regex: "404"
21 | action: drop
22 |
23 | - source_labels: ["httpPath"]
24 | regex: "/api/v1/ppchit/rule/[0-9a-fA-F]{5,24}"
25 | target_label: "httpPath"
26 | replacement: "/api/v1/ppchit/rule/0"
27 |
28 | - source_labels: ["eventKey","httpMethod","httpPath"]
29 | separator: ":"
30 | regex: ":(.*)"
31 | target_label: "eventKey"
32 | replacement: "$1"
33 |
34 | eventKeyGenerator:
35 | filedSeparator: ":"
36 | overrideExistingEventKey: false
37 | metadataKeys:
38 | - eventKey
39 |
40 | metadataClassifier:
41 | sloDomainMetadataKey: "sloDomain"
42 | sloClassMetadataKey: "sloClass"
43 | sloAppMetadataKey: "sloApp"
44 | overrideExistingValues: true
45 |
46 | dynamicClassifier:
47 | exactMatchesCsvFiles: []
48 | regexpMatchesCsvFiles:
49 | - "./classifications.csv"
50 |
51 | sloEventProducer:
52 | rulesFiles:
53 | - "./slo_rules.yaml"
54 |
55 | prometheusExporter:
56 | metricName: "slo_events_total"
57 | maximumUniqueEventKeys: 1000
58 | ExceededKeyLimitPlaceholder: "cardinalityLimitExceeded"
59 | labelNames:
60 | result: "result"
61 | sloDomain: "slo_domain"
62 | sloClass: "slo_class"
63 | sloApp: "slo_app"
64 | eventKey: "event_key"
65 |
--------------------------------------------------------------------------------
/test/Test_SloHeaders/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 | - slo_matcher:
3 | domain: userportal
4 | failure_conditions:
5 | - operator: numberIsHigherThan
6 | key: statusCode
7 | value: 499
8 | additional_metadata:
9 | slo_type: availability
10 | slo_version: 1
11 |
12 | - metadata_matcher:
13 | - operator: isMatchingRegexp
14 | key: requestDuration
15 | value: ".*"
16 | slo_matcher:
17 | domain: userportal
18 | class: critical
19 | failure_conditions:
20 | - operator: numberIsHigherThan
21 | key: requestDuration
22 | value: 8
23 | additional_metadata:
24 | slo_version: 1
25 | slo_type: latency90
26 | percentile: 90
27 | le: 8.0
28 |
29 | - slo_matcher:
30 | domain: userportal
31 | class: critical
32 | failure_conditions:
33 | - operator: numberIsHigherThan
34 | key: requestDuration
35 | value: 16
36 | additional_metadata:
37 | slo_version: 1
38 | slo_type: latency99
39 | percentile: 99
40 | le: 16.0
41 |
--------------------------------------------------------------------------------
/test/Test_SloHeadersUpdateCache/README.md:
--------------------------------------------------------------------------------
1 | # SloHeadersUpdateCache
2 |
3 | On 3 log lines, verify that
4 | - first log line which does not contain SLO classification information is classified according to the dynamic classifier initial config
5 | - second log line is classified according to the information which is contained within it
6 | - third line is classified according to the information within the previous log line, even though it does not bear any SLO classification information
7 |
--------------------------------------------------------------------------------
/test/Test_SloHeadersUpdateCache/classifications.csv:
--------------------------------------------------------------------------------
1 | userportal,frontend-api,high_fast,"^(GET|POST|HEAD|PUT|DELETE):/api/v1/ppchit/rule/[0-9a-f]+$"
2 |
--------------------------------------------------------------------------------
/test/Test_SloHeadersUpdateCache/logs:
--------------------------------------------------------------------------------
1 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="-" slo-app="-" slo-class="-" slo-endpoint="-" slo-result="-"
2 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="userportal" slo-app="frontend-api" slo-class="critical" slo-endpoint="-" slo-result="success"
3 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="-" slo-app="-" slo-class="-" slo-endpoint="-" slo-result="-"
4 |
--------------------------------------------------------------------------------
/test/Test_SloHeadersUpdateCache/metrics:
--------------------------------------------------------------------------------
1 | # HELP slo_domain_slo_class_slo_app_event_key:slo_events_total Total number of SLO events exported with it's result and metadata.
2 | # TYPE slo_domain_slo_class_slo_app_event_key:slo_events_total counter
3 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="fail",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="availability",slo_version="1"} 0
4 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="success",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="availability",slo_version="1"} 2
5 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="fail",slo_app="frontend-api",slo_class="high_fast",slo_domain="userportal",slo_type="availability",slo_version="1"} 0
6 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="success",slo_app="frontend-api",slo_class="high_fast",slo_domain="userportal",slo_type="availability",slo_version="1"} 1
7 | # HELP slo_exporter_tailer_lines_read_total Total number of lines tailed from the file.
8 | # TYPE slo_exporter_tailer_lines_read_total counter
9 | slo_exporter_tailer_lines_read_total 3
10 |
--------------------------------------------------------------------------------
/test/Test_SloHeadersUpdateCache/slo_exporter.yaml:
--------------------------------------------------------------------------------
1 |
2 | webServerListenAddress: "0.0.0.0:8080"
3 |
4 | pipeline: ["tailer", "relabel", "eventKeyGenerator", "metadataClassifier", "dynamicClassifier", "sloEventProducer", "prometheusExporter"]
5 |
6 | modules:
7 |
8 | tailer:
9 | tailedFile: "./logs"
10 | follow: true
11 | reopen: true
12 | positionFile: ""
13 | positionPersistenceInterval: "2s"
14 | loglineParseRegexp: '^(?P[A-Fa-f0-9.:]{4,50}) \S+ \S+ \[(?P.*?)\] "\s*(?PGET|POST|HEAD|UPDATE|DELETE|PUT|CONNECT|OPTIONS|TRACE|PATCH)\s+(?P[^\s]+)\s+(?P[^\s]+)\s*" (?P\d+) \d+ "(?P.*?)" uag="(?P[^"]+)" "[^"]+" ua="[^"]+" rt="(?P\d+(\.\d+)??)".* frpc-status="(?P[^"]*)" slo-domain="(?P[^"]*)" slo-app="(?P[^"]*)" slo-class="(?P[^"]*)" slo-endpoint="(?P[^"]*)" slo-result="(?P[^"]*)"'
15 | emptyGroupRE: '^-$'
16 |
17 | relabel:
18 | eventRelabelConfigs:
19 | - source_labels: ["statusCode"]
20 | regex: "404"
21 | action: drop
22 |
23 | - source_labels: ["httpPath"]
24 | regex: "/api/v1/ppchit/rule/[0-9a-fA-F]{5,24}"
25 | target_label: "httpPath"
26 | replacement: "/api/v1/ppchit/rule/0"
27 |
28 | eventKeyGenerator:
29 | filedSeparator: ":"
30 | overrideExistingEventKey: false
31 | metadataKeys:
32 | - httpMethod
33 | - httpPath
34 |
35 | metadataClassifier:
36 | sloDomainMetadataKey: "sloDomain"
37 | sloClassMetadataKey: "sloClass"
38 | sloAppMetadataKey: "sloApp"
39 | overrideExistingValues: true
40 |
41 | dynamicClassifier:
42 | exactMatchesCsvFiles: []
43 | regexpMatchesCsvFiles:
44 | - "./classifications.csv"
45 |
46 | sloEventProducer:
47 | rulesFiles:
48 | - "./slo_rules.yaml"
49 |
50 | prometheusExporter:
51 | metricName: "slo_events_total"
52 | maximumUniqueEventKeys: 1000
53 | ExceededKeyLimitPlaceholder: "cardinalityLimitExceeded"
54 | labelNames:
55 | result: "result"
56 | sloDomain: "slo_domain"
57 | sloClass: "slo_class"
58 | sloApp: "slo_app"
59 | eventKey: "event_key"
60 |
--------------------------------------------------------------------------------
/test/Test_SloHeadersUpdateCache/slo_rules.yaml:
--------------------------------------------------------------------------------
1 | rules:
2 | - slo_matcher:
3 | domain: userportal
4 | failure_conditions:
5 | - operator: numberIsHigherThan
6 | key: statusCode
7 | value: 499
8 | additional_metadata:
9 | slo_type: availability
10 | slo_version: 1
11 |
--------------------------------------------------------------------------------
/test/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -eo pipefail
4 |
5 | # exit with $1 status code and printing $2 to stderr
# Print $2 to stderr and exit with status code $1.
function myexit {
  # >&2 is the portable way to write to stderr; redirecting to the
  # /dev/stderr device file can fail in some sandboxed/container setups.
  echo "$2" >&2
  exit "$1"
}
10 |
# Remove leftovers of previous test runs: tailer position files and
# per-test output directories.
function cleanup {
  # Pass an explicit start path ('.'); `find` with no path is a GNU extension.
  # -delete avoids running `rm` with no arguments when nothing matches
  # (the original xargs pipeline lacked -r / --no-run-if-empty).
  find . -type f -name logs.pos -delete
  find . -type d -name "${TEST_RESULT_DIR}" -exec rm -rf '{}' +
}
15 |
# Fetch the current metrics page of the locally running slo_exporter instance.
function get_metrics {
  curl -s ${METRICS_URL}
}
19 |
# Verify that every line of the expected-metrics file (${METRICS_FILENAME}
# in the current test directory) is present verbatim in the metrics scraped
# into ${TEST_RESULT_DIR}. Exits 1 on the first missing line.
function evaluate_test_result {
  # read -r: do not let backslashes in metric lines be interpreted.
  while read -r line ; do
    # -F: match as a fixed string. Metric lines contain regex
    # metacharacters ({}, ., +) which would otherwise be interpreted as
    # patterns and could yield false-positive matches.
    if ! grep -qF "$line" "${TEST_RESULT_DIR}/${METRICS_FILENAME}"; then
      echo " FAIL: Missing the following metric!"
      echo "$line"
      exit 1
    fi
  done < "${METRICS_FILENAME}"
  echo " OK: found all expected metrics"
}
30 |
# Resolve the directory this script lives in and the slo_exporter binary
# expected one level above it.
SCRIPT_DIR=$( dirname "$(readlink -f $0)" )
SLO_EXPORTER="${SCRIPT_DIR}/../slo_exporter"

# Every directory whose name contains this prefix is treated as one test case.
TEST_DIR_PREFIX="Test_"
# Per-test scratch directory for the exporter log and scraped metrics.
TEST_RESULT_DIR="test_output"

CONFIG_FILENAME="slo_exporter.yaml"
METRICS_URL="http://localhost:8080/metrics"
METRICS_FILENAME="metrics"

SLO_EXPORTER_LOG_FILENAME="slo_exporter.log"

# Start from a clean slate in case a previous run left artifacts behind.
cleanup

for i_test in $(find "${SCRIPT_DIR}" -type d | grep ${TEST_DIR_PREFIX}) ; do
  echo "${i_test}"

  pushd ${i_test} > /dev/null
  mkdir ${TEST_RESULT_DIR}
  # Run the exporter in the background, capturing stdout+stderr for debugging.
  ${SLO_EXPORTER} --log-level=debug --config-file=${CONFIG_FILENAME} > ${TEST_RESULT_DIR}/${SLO_EXPORTER_LOG_FILENAME} 2>&1 &
  sleep 1
  # test whether SLO_EXPORTER is running in the background (did not exit during initialization)
  [ -z "$(jobs %% | grep Running)" ] && \
    myexit 1 "${SLO_EXPORTER} is not running. Exiting..."
  get_metrics > ${TEST_RESULT_DIR}/${METRICS_FILENAME}
  # kill slo exporter test instance
  kill %%

  evaluate_test_result
  popd > /dev/null
done
--------------------------------------------------------------------------------
/tools/slo-rules-generator/README.md:
--------------------------------------------------------------------------------
1 | slo-rules-generator is a tool which generates Prometheus recording rules based on an input definition of an SLO domain. The generated rules are necessary for SLO based alerting - error budget exhaustion and burn-rate based alerts.
2 |
3 | See [./slo-domains.yaml.example](./slo-domains.yaml.example) for commented example of input configuration.
4 |
5 | ## Usage
6 |
7 | 1. Run `go version` to verify that you have `Go` installed - if not, refer to [golang website](https://golang.org/doc/install)
8 | 1. Build `slo-rules-generator` by running `go build .`
9 | 1. Run `slo-rules-generator` with `slo-domain.yaml.example` as an argument by running the following command:
10 | ```bash
11 | slo-rules-generator slo-domains.yaml.example
12 | ```
13 | ## Metrics in generated output
14 | ### slo:stable_version
15 | - used in order to link given domain to specific team (label `team`)
16 | - `enabled="true|false"` disable burn_rate, error budget alerts
17 | - `escalate` label documents first escalation level for the given domain
18 |
19 | Example:
20 | ```
21 | slo:stable_version{enabled="true", escalate="sre-team@company.org", namespace="test", slo_domain="example-domain", slo_version="1", team="example-team@company.org"}
22 | ```
23 | ### slo:violation_ratio_threshold
24 | - holds value of threshold for given `slo_version, slo_domain, slo_class, slo_type, namespace`
25 | - additional labels simplify values visualization in Grafana for latency-related SLO types - `percentile`, `le` (same as for `le` in Prometheus histograms, documents latency threshold)
26 |
27 | Example:
28 | ```
29 | slo:violation_ratio_threshold{le="0.6", namespace="test", percentile="90", slo_class="critical", slo_domain="example-domain", slo_type="latency90", slo_version="1"}
30 | 0.9
31 | slo:violation_ratio_threshold{le="12.0", namespace="test", percentile="99", slo_class="critical", slo_domain="example-domain", slo_type="latency99", slo_version="1"}
32 | 0.99
33 | slo:violation_ratio_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_type="availability", slo_version="1"}
34 | 0.9
35 | ```
36 | ### slo:burn_rate_threshold
37 | - modifier for slo:burn_rate based alerts' threshold
38 | - default values are usually reasonable, make sure you read chapters on SLO from [SRE workbook](https://sre.google/workbook/table-of-contents/) before even considering to change these
39 |
40 | Example:
41 | ```
42 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="1d", slo_type="availability", slo_version="1"}
43 | 2.8
44 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="1d", slo_type="latency90", slo_version="1"}
45 | 2.8
46 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="1d", slo_type="latency99", slo_version="1"}
47 | 2.8
48 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="1h", slo_type="availability", slo_version="1"}
49 | 13.44
50 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="1h", slo_type="latency90", slo_version="1"}
51 | 13.44
52 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="1h", slo_type="latency99", slo_version="1"}
53 | 13.44
54 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="3d", slo_type="availability", slo_version="1"}
55 | 1
56 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="3d", slo_type="latency90", slo_version="1"}
57 | 1
58 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="3d", slo_type="latency99", slo_version="1"}
59 | 1
60 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="6h", slo_type="availability", slo_version="1"}
61 | 5.6
62 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="6h", slo_type="latency90", slo_version="1"}
63 | 5.6
64 | slo:burn_rate_threshold{namespace="test", slo_class="critical", slo_domain="example-domain", slo_time_range="6h", slo_type="latency99", slo_version="1"}
65 | 5.6
66 | ```
67 |
--------------------------------------------------------------------------------
/tools/slo-rules-generator/alerting.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import "fmt"
4 |
var (
	// defaultTimerangeThresholds is the built-in set of burn-rate alert
	// thresholds, one per supported alerting time range. Shorter windows
	// carry higher thresholds so that only fast error-budget burn alerts
	// early, while the long windows catch slow burn. See the README for
	// the rationale (based on the SRE workbook's burn-rate alerting).
	defaultTimerangeThresholds = []BurnRateThreshold{
		{
			Condition: Condition{TimeRange: "1h"},
			Value: 13.44,
		},
		{
			Condition: Condition{TimeRange: "6h"},
			Value: 5.6,
		},
		{
			Condition: Condition{TimeRange: "1d"},
			Value: 2.8,
		},
		{
			Condition: Condition{TimeRange: "3d"},
			Value: 1,
		},
	}
)
25 |
26 | type Alerting struct{
27 | Team string
28 | Escalate string
29 | BurnRateThresholds []BurnRateThreshold `yaml:"burn_rate_thresholds"`
30 | }
31 |
32 | func (a Alerting) IsValid() []error {
33 | errs := []error{}
34 | for _, t := range a.BurnRateThresholds {
35 | if thresholdsErrs := t.IsValid(); len(thresholdsErrs) > 0 {
36 | errs = append(errs, thresholdsErrs...)
37 | }
38 | }
39 | return errs
40 | }
41 |
42 | type BurnRateThreshold struct{
43 | Condition Condition
44 | Value float32
45 | }
46 |
47 | // Returns subset of thresholds which matches given class and slo type
48 | func getMatchingSubset(thresholds []BurnRateThreshold, className, sloType string) []BurnRateThreshold {
49 | matchingBurnRateThresholds := []BurnRateThreshold{}
50 | for _, t := range thresholds {
51 | if t.Condition.Matches(className, sloType) {
52 | matchingBurnRateThresholds = append(matchingBurnRateThresholds, t)
53 | }
54 | }
55 | return matchingBurnRateThresholds
56 | }
57 |
58 | func (t BurnRateThreshold) IsValid() []error {
59 | errs := []error{}
60 | if t.Value <= 0 {
61 | errs = append(errs, fmt.Errorf("burn-rate treshold must be greater than 0"))
62 | }
63 | if err := t.Condition.IsValid(); err != nil {
64 | errs = append(errs, err)
65 | }
66 | return errs
67 | }
68 |
// Condition restricts when a BurnRateThreshold applies. An empty Class or
// Type matches any value; TimeRange must be one of the known alerting time
// ranges (see defaultTimerangeThresholds).
type Condition struct {
	Class string
	Type string `yaml:"slo_type"`
	TimeRange BurnRateTimeRange `yaml:"time_range"`
}
74 |
75 | func (c Condition) Matches(class, sloType string) bool {
76 | return (c.Class == "" || c.Class == class) && (c.Type == "" || c.Type == sloType)
77 | }
78 |
// IsValid validates the condition's time range.
func (c Condition) IsValid() error {
	// Class and Type need knowledge of the whole configuration to be
	// validated, so that happens at the global context; locally we can
	// only check the time range.
	return c.TimeRange.IsValid()
}
83 |
84 | type BurnRateTimeRange string
85 |
86 | func (t BurnRateTimeRange) IsValid() error {
87 | var found bool
88 | for _, burnRateTreshold := range defaultTimerangeThresholds {
89 | if burnRateTreshold.Condition.TimeRange == t {
90 | // given timerange matches one of timeranges in the default set
91 | found = true
92 | break
93 | }
94 | }
95 | if !found {
96 | return fmt.Errorf("invalid burn-rate timerange: %s.", string(t))
97 | }
98 | return nil
99 | }
100 |
101 |
--------------------------------------------------------------------------------
/tools/slo-rules-generator/all-in-one-example-domain.yaml:
--------------------------------------------------------------------------------
1 | # Configuration for slo-rules-generator tool.
2 | # If modified, run slo-rules-generator and move its output to recording_rules/
3 | # See ../../tools/slo-rules-generator/README.md for more information.
4 | example-domain:
5 | enabled: true
6 | namespace: test
7 | version: 1
8 | alerting:
9 | team: example-team@company.org
10 | escalate: sre-team@company.org
11 | classes:
12 | critical:
13 | availability: {slo_threshold: 0.90}
14 | latency90: {
15 | slo_threshold: 0.90,
16 | slo_threshold_metadata: {percentile: 90, le: 0.6}
17 | }
18 | latency99: {
19 | slo_threshold: 0.99,
20 | slo_threshold_metadata: {percentile: 99, le: 12.0}
21 | }
22 |
--------------------------------------------------------------------------------
/tools/slo-rules-generator/class.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 |
7 | "github.com/prometheus/prometheus/pkg/rulefmt"
8 | )
9 |
// Prometheus label names attached to the generated recording rules.
const (
	classLabel = "slo_class"
	sloTypeLabel = "slo_type"
)
14 |
15 | type Classes map[string]Class // Key is SloClass name
16 |
17 |
18 | // Returns a sorted list of classes names
19 | func (c Classes) Names() []string {
20 | res := []string{}
21 | for name,_ := range c {
22 | res = append(res, name)
23 | }
24 | sort.Strings(res)
25 | return res
26 | }
27 |
28 | type Class map[string]SloType // Key is SloType
29 |
30 | // Returns a sorted list of class' SLO types names
31 | func (c Class) Names() []string {
32 | res := []string{}
33 | for name,_ := range c {
34 | res = append(res, name)
35 | }
36 | sort.Strings(res)
37 | return res
38 | }
39 |
40 | func (c Class) IsValid() []error {
41 | errs := []error{}
42 | for sloTypeName, Threshold := range c {
43 | if err := Threshold.IsValid(); err != nil {
44 | errs = append(errs,fmt.Errorf("error validating '%s': %w", sloTypeName, err))
45 | }
46 | }
47 | return errs
48 | }
49 |
50 | // Returns SLO class representation as a list of Prometheus rules
51 | // If provided burnRateThresholds are nil, defaultTimerangeThresholds are used
52 | func (c Class) AsRules(className string, commonLabels Labels, burnRateThresholds []BurnRateThreshold) []rulefmt.RuleNode {
53 | rules := []rulefmt.RuleNode{}
54 | if burnRateThresholds == nil {
55 | burnRateThresholds = defaultTimerangeThresholds
56 | }
57 | commonLabels = commonLabels.Merge(Labels{classLabel: className})
58 | for _, sloTypeName := range c.Names() {
59 | burnRateThresholdsForType := getMatchingSubset(burnRateThresholds, className, sloTypeName)
60 | rules = append(rules,
61 | c[sloTypeName].AsRules(sloTypeName, commonLabels, burnRateThresholdsForType)...
62 | )
63 |
64 | }
65 | return rules
66 | }
67 |
// SloType holds the threshold configuration of a single SLO type.
type SloType struct {
	Value float32 `yaml:"slo_threshold"` // target ratio, validated to be within 0-1
	Metadata Labels `yaml:"slo_threshold_metadata"` // extra labels, e.g. percentile and le
}
72 |
73 | func (t SloType) IsValid() error {
74 | if t.Value < 0 || t.Value > 1 {
75 | return fmt.Errorf("slo threshold must be 0-1, not: %f", t.Value)
76 | }
77 | return nil
78 | }
79 |
80 | func (t SloType) AsRules(sloTypeName string, commonLabels Labels, burnRateThresholds []BurnRateThreshold) []rulefmt.RuleNode {
81 | rules := []rulefmt.RuleNode{}
82 | commonLabels = commonLabels.Merge(Labels{sloTypeLabel: sloTypeName})
83 |
84 | rules = append(rules,
85 | t.burnRateThresholdRules(commonLabels, burnRateThresholds)...
86 | )
87 | rules = append(rules, t.violationRatioThresholdRule(commonLabels))
88 | return rules
89 | }
90 |
// violationRatioThresholdRule renders the slo:violation_ratio_threshold
// recording rule whose constant expression is the configured threshold
// value, labelled with the common labels merged with the type's metadata
// (e.g. percentile, le).
func (t SloType) violationRatioThresholdRule(commonLabels Labels) rulefmt.RuleNode {
	return rulefmt.RuleNode{
		Record: yamlStr("slo:violation_ratio_threshold"),
		Expr: yamlStr(fmt.Sprint(t.Value)),
		Labels: commonLabels.Merge(t.Metadata),
	}
}
98 |
99 | func (t SloType) burnRateThresholdRules(commonLabels Labels, burnRateThresholds []BurnRateThreshold) []rulefmt.RuleNode {
100 | rules := []rulefmt.RuleNode{}
101 | for _, burnRateThreshold := range burnRateThresholds {
102 | rules = append(rules,
103 | rulefmt.RuleNode{
104 | Record: yamlStr("slo:burn_rate_threshold"),
105 | Expr: yamlStr(fmt.Sprint(burnRateThreshold.Value)),
106 | Labels: commonLabels.Merge(Labels{"slo_time_range": string(burnRateThreshold.Condition.TimeRange)}),
107 | })
108 | }
109 | return rules
110 | }
111 |
112 |
113 |
--------------------------------------------------------------------------------
/tools/slo-rules-generator/domain.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "time"
6 |
7 | "github.com/prometheus/common/model"
8 | "github.com/prometheus/prometheus/pkg/rulefmt"
9 | )
10 |
// Prometheus label names attached to the generated domain-level rules.
const (
	domainLabel = "slo_domain"
	namespaceLabel = "namespace"
	versionLabel = "slo_version"
	enabledLabel = "enabled"
	teamLabel = "team"
	escalateLabel = "escalate"
)
19 |
// Domain is one SLO domain as loaded from the input YAML. Name maps to
// the explicit "domain" key; the remaining fields are unmarshalled by
// yaml.v3's default lowercased-field-name convention.
type Domain struct {
	Name      string `yaml:"domain"` // optional explicit name; DomainName falls back to the config key
	Namespace string
	Enabled   bool
	Version   int
	Alerting  Alerting
	Classes   Classes
}
28 |
29 | func (d Domain) DomainName(configName string) string {
30 | if d.Name != "" {
31 | return d.Name
32 | } else {
33 | return configName
34 | }
35 | }
36 |
37 | func (d Domain) AsRuleGroups(configName string) []rulefmt.RuleGroup {
38 | domainRulegroup := rulefmt.RuleGroup{
39 | Name: fmt.Sprintf("slo_v%d_slo_exporter_%s", d.Version, d.DomainName(configName)),
40 | Interval: model.Duration(4 * time.Minute),
41 | Rules: []rulefmt.RuleNode{},
42 | }
43 | domainRulegroup.Rules = append(domainRulegroup.Rules, d.stableVersionRule(d.DomainName(configName)))
44 | out := []rulefmt.RuleGroup{
45 | domainRulegroup,
46 | }
47 |
48 | for _, className := range d.Classes.Names() {
49 | domainClassRulegroup := rulefmt.RuleGroup{
50 | Name: fmt.Sprintf("slo_v%d_slo_exporter_%s_%s", d.Version, d.DomainName(configName), className),
51 | Interval: model.Duration(4 * time.Minute),
52 | Rules: []rulefmt.RuleNode{},
53 | }
54 | domainClassRulegroup.Rules = append(
55 | domainClassRulegroup.Rules,
56 | d.Classes[className].AsRules(className, d.commonLabels(d.DomainName(configName)), d.Alerting.BurnRateThresholds)...,
57 | )
58 | out = append(out, domainClassRulegroup)
59 | }
60 |
61 | return out
62 | }
63 |
64 | func (d Domain) commonLabels(domainName string) Labels {
65 | return Labels{
66 | domainLabel: domainName,
67 | versionLabel: fmt.Sprint(d.Version),
68 | namespaceLabel: fmt.Sprint(d.Namespace),
69 | }
70 | }
71 |
72 | func (d Domain) stableVersionRule(domainName string) rulefmt.RuleNode {
73 | labels := Labels{
74 | teamLabel: d.Alerting.Team,
75 | enabledLabel: fmt.Sprint(d.Enabled),
76 | }
77 | if d.Alerting.Escalate != "" {
78 | labels[escalateLabel] = d.Alerting.Escalate
79 | }
80 | return rulefmt.RuleNode{
81 | Record: yamlStr("slo:stable_version"),
82 | Expr: yamlStr("1"),
83 | Labels: d.commonLabels(domainName).Merge(labels),
84 | }
85 | }
86 |
87 | func (d Domain) IsValid() []error {
88 | errs := []error{}
89 | if err := d.Alerting.IsValid(); len(err) > 0 {
90 | errs = append(errs, fmt.Errorf("alerting validation failed: %v", err))
91 | }
92 | for className, classConf := range d.Classes {
93 | if err := classConf.IsValid(); len(err) > 0 {
94 | errs = append(errs, fmt.Errorf("class '%s' validation failed: %v", className, err))
95 | }
96 | }
97 | return append(errs, d.validateReferences()...)
98 | }
99 |
100 | // Validates whether classes and slo_types references in alerting..conditions are defined in classes section
101 | func (d Domain) validateReferences() []error {
102 | errs := []error{}
103 | for _, threshold := range d.Alerting.BurnRateThresholds {
104 | class := threshold.Condition.Class
105 | if class != "" {
106 | if _, ok := d.Classes[class]; !ok {
107 | errs = append(errs, fmt.Errorf("class '%s' referenced in condition not defined", class))
108 | }
109 | }
110 | if sloType := threshold.Condition.Type; sloType != "" {
111 | if class != "" {
112 | if _, typeFound := d.Classes[class][sloType]; !typeFound {
113 | errs = append(errs, fmt.Errorf("slo type '%s' referenced in condition not defined for class '%s'", sloType, class))
114 | }
115 | } else {
116 | sloTypeFound := false
117 | for _, class := range d.Classes {
118 | if _, ok := class[sloType]; ok {
119 | sloTypeFound = true
120 | break
121 | }
122 | }
123 | if !sloTypeFound {
124 | errs = append(errs, fmt.Errorf("slo type '%s' referenced in condition not defined in any class", sloType))
125 | }
126 | }
127 | }
128 | }
129 | return errs
130 | }
131 |
--------------------------------------------------------------------------------
/tools/slo-rules-generator/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/seznam/slo-exporter/tools/slo-rules-generator
2 |
3 | go 1.16
4 |
5 | require (
6 | github.com/prometheus/common v0.30.0
7 | // We fetch the exact revision because of issue described at https://github.com/prometheus/prometheus/issues/6048#issuecomment-534549253
8 | github.com/prometheus/prometheus v1.8.2-0.20210914090109-37468d88dce8
9 | github.com/stretchr/testify v1.7.0
10 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
11 | )
12 |
--------------------------------------------------------------------------------
/tools/slo-rules-generator/slo-domains.yaml.example:
--------------------------------------------------------------------------------
1 | userportal-reports-v6: # config-name
2 | domain: userportal-reports # fallbacks to config-name if not specified
3 | enabled: true
4 | namespace: production
5 | version: 6
6 | alerting:
7 | team: team.x@company.com
8 | escalate: sre.x@company.com
9 | # Thresholds for burn-rate alerts. If not present, the following defaults are used for all domains, classes:
10 | # 1h: 13.44
11 | # 6h: 5.6
12 | # 1d: 2.8
13 | # 3d: 1
14 | # class and slo_type in conditions may be an empty string - which effectively matches any class or slo_type
15 | burn_rate_thresholds:
16 | - condition:
17 | class: 'critical'
18 | slo_type: 'availability'
19 | time_range: '1h'
20 | value: 13.44
21 | - condition:
22 | class: 'low'
23 | slo_type: 'latency90'
24 | time_range: '6h'
25 | value: 33.12
26 | classes:
27 | critical: # slo_class name
28 | availability:
29 | slo_threshold: 0.99
30 | latency90:
31 | slo_threshold: 0.90
32 | # labels added to resulting slo:violation_ratio_threshold metrics
33 | slo_threshold_metadata:
34 | percentile: 90
35 | le: 0.8 # The 0.8 is a maximum duration of event to be considered successful taken from the slo_rules.yaml
36 | low:
37 | availability:
38 | slo_threshold: 0.99
39 | latency90:
40 | slo_threshold: 0.90
41 | slo_threshold_metadata:
42 | percentile: 90
43 | le: 0.8
44 |
--------------------------------------------------------------------------------
/tools/slo-rules-generator/slo-rules-generator.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/prometheus/prometheus/pkg/rulefmt"
8 | "gopkg.in/yaml.v3"
9 | )
10 |
const (
	// domainFileHeaderComment is prepended to every generated rules file
	// to warn readers that the file is machine-generated.
	domainFileHeaderComment = `#
# This file has been generated by slo-rules-generator.
# DO NOT EDIT MANUALLY!
#
`
)
18 |
19 | func yamlStr(s string) yaml.Node {
20 | n := yaml.Node{}
21 | n.SetString(s)
22 | return n
23 | }
24 |
25 | type Labels map[string]string
26 |
27 | func (l Labels) Merge(with Labels) Labels {
28 | out := Labels{}
29 | for k, v := range l {
30 | out[k] = v
31 | }
32 | for k, v := range with {
33 | out[k] = v
34 | }
35 | return out
36 | }
37 |
38 | type SloConfiguration map[string]Domain
39 |
40 | func main() {
41 | if len(os.Args) != 2 {
42 | fmt.Printf("Usage: %s ", os.Args[0])
43 | os.Exit(1)
44 | }
45 | confFilename := os.Args[1]
46 | data, err := os.ReadFile(confFilename)
47 | if err != nil {
48 | fmt.Printf("Unable read file '%s': %v", confFilename, err)
49 | os.Exit(2)
50 | }
51 | conf := SloConfiguration{}
52 | err = yaml.Unmarshal(data, &conf)
53 | if err != nil {
54 | fmt.Printf("Unable to parse input configuration: %s", err.Error())
55 | os.Exit(2)
56 | }
57 | for configName, domainConf := range conf {
58 | if errs := domainConf.IsValid(); len(errs) > 0 {
59 | fmt.Printf("Error while validating '%s' configuration:\n%v", configName, errs)
60 | os.Exit(2)
61 | }
62 | domainGroups := rulefmt.RuleGroups{
63 | Groups: domainConf.AsRuleGroups(configName),
64 | }
65 | data, err := yaml.Marshal(domainGroups)
66 | if err != nil {
67 | fmt.Printf("Unable to marshall %s: %v", configName, err)
68 | os.Exit(2)
69 | }
70 | fname := fmt.Sprintf("%s.yaml", configName)
71 | f, err := os.Create(fname)
72 | if err != nil {
73 | fmt.Printf("Error while creating file %s: %v", fname, err)
74 | os.Exit(1)
75 | }
76 | fmt.Printf("-> %s\n", fname)
77 | defer f.Close()
78 | _, err = f.WriteString(domainFileHeaderComment)
79 | if err != nil {
80 | fmt.Printf("Error while writing to %s: %v", fname, err)
81 | os.Exit(1)
82 | }
83 | _, err = f.Write(data)
84 | if err != nil {
85 | fmt.Printf("Error while writing to %s: %v", fname, err)
86 | os.Exit(1)
87 | }
88 | }
89 | }
90 |
--------------------------------------------------------------------------------