├── examples ├── all_in_one │ ├── prometheus │ │ ├── alerts │ │ └── recording_rules │ ├── example-domain-slo-conf.yaml │ ├── slo-exporter │ │ └── conf │ │ │ ├── classification.csv │ │ │ ├── slo_rules.yaml │ │ │ └── slo_exporter.yaml │ ├── grafana │ │ └── provisioning │ │ │ ├── dashboards │ │ │ └── dashboard.yml │ │ │ └── datasources │ │ │ └── datasource.yml │ ├── nginx │ │ └── conf │ │ │ └── nginx.conf │ ├── README.md │ └── docker-compose.yaml ├── prometheus │ ├── regexp_events_classification.csv │ ├── exact_events_classification.csv │ ├── slo_rules.yaml │ ├── slo_exporter.yaml │ └── README.md ├── README.md ├── nginx_proxy │ ├── classification.csv │ ├── slo_rules.yaml │ ├── README.md │ └── slo_exporter.yaml ├── kafka │ ├── slo-exporter │ │ ├── slo_rules.yaml │ │ └── slo_exporter.yaml │ ├── README.md │ └── docker-compose.yaml └── envoy_proxy │ ├── docker-compose.yaml │ ├── slo-exporter │ ├── slo_rules.yaml │ └── slo_exporter.yaml │ ├── README.md │ └── envoy │ └── envoy.yaml ├── .gitignore ├── pkg ├── dynamic_classifier │ ├── testdata │ │ ├── TestMatcherRegexpDumpCSV.golden │ │ ├── TestLoadRegexpMatchesFromMultipleCSV.golden │ │ ├── TestMatcherExactDumpCSV.golden │ │ ├── TestLoadExactMatchesFromMultipleCSV.golden │ │ ├── Test_DynamicClassifier_Classify_OverridesCacheFromConfig.golden │ │ ├── TestClassificationByExactMatches.golden │ │ └── TestClassificationByRegexpMatches.golden │ ├── matcher.go │ ├── memory_exact_matcher.go │ ├── matcher_test.go │ └── regexp_matcher.go ├── slo_event_producer │ ├── testdata │ │ ├── slo_rules_invalid.yaml.golden │ │ └── slo_rules_valid.yaml.golden │ ├── config.go │ └── config_test.go ├── storage │ ├── interfaces.go │ ├── in_memory.go │ ├── capped_container_test.go │ └── container_test.go ├── envoy_access_log_server │ └── util.go ├── prometheus_ingester │ ├── headerRoundTripper.go │ ├── query_executor_test.go │ └── headerRoundTripper_test.go ├── event │ ├── slo_classification.go │ ├── slo.go │ └── raw.go ├── prober │ ├── prober_test.go 
│ └── prober.go ├── pipeline │ ├── module_test.go │ └── module.go ├── config │ └── config.go ├── event_key_generator │ └── event_key_generator_test.go ├── event_metadata_renamer │ └── renamer_test.go ├── stringmap │ └── stringmap_benchmark_test.go ├── prometheus_exporter │ ├── aggregating_counter_test.go │ └── exemplars.go ├── metadata_classifier │ └── metadata_cassifier_test.go └── relabel │ ├── relabel_test.go │ └── relabel.go ├── test ├── Test_SloHeaders │ ├── classifications.csv │ ├── README.md │ ├── logs │ ├── slo_rules.yaml │ ├── metrics │ └── slo_exporter.yaml ├── Test_SloHeadersUpdateCache │ ├── classifications.csv │ ├── slo_rules.yaml │ ├── README.md │ ├── metrics │ ├── logs │ └── slo_exporter.yaml ├── Test_MetricsInitialization │ ├── classifications.csv │ ├── README.md │ ├── logs │ ├── slo_rules.yaml │ └── slo_exporter.yaml └── run_tests.sh ├── grafana_dashboards ├── SLO_detailed.json ├── slo_exporter.json ├── SLO_drilldown.json ├── SLO_domains_overview.json └── README.md ├── prometheus ├── recording_rules │ ├── slo_data_corrections.yaml │ ├── error-budget.yaml │ ├── burn-rate.yaml │ ├── rate-coefficient.yaml │ └── events-over-time.yaml └── alerts │ ├── slo_data_corrections.yaml │ ├── error-budget.yaml │ ├── missing_all_data.yaml │ ├── missing_data.yaml │ └── slo_exporter_alerts.yaml ├── Dockerfile ├── kubernetes ├── slo-exporter-service.yaml ├── README.md ├── slo-exporter-statefulset.yaml └── slo-exporter-configmap.yaml ├── tools └── slo-rules-generator │ ├── go.mod │ ├── all-in-one-example-domain.yaml │ ├── slo-domains.yaml.example │ ├── slo-rules-generator.go │ ├── alerting.go │ ├── class.go │ ├── domain.go │ └── README.md ├── docs ├── modules │ ├── event_metadata_renamer.md │ ├── relabel.md │ ├── metadata_classifier.md │ ├── event_key_generator.md │ ├── statistical_classifier.md │ ├── tailer.md │ ├── kafka_ingester.md │ ├── dynamic_classifier.md │ └── prometheus_exporter.md ├── operating.md ├── architecture.md ├── glossary.md └── configuration.md ├── 
scripts ├── benchmark.sh └── generate_godoc.sh ├── .golangci.yaml ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── CODE_OF_CONDUCT.md ├── .circleci └── config.yml ├── CONTRIBUTING.md ├── Makefile ├── .goreleaser.yml └── go.mod /examples/all_in_one/prometheus/alerts: -------------------------------------------------------------------------------- 1 | ../../../prometheus/alerts/ -------------------------------------------------------------------------------- /examples/all_in_one/prometheus/recording_rules: -------------------------------------------------------------------------------- 1 | ../../../prometheus/recording_rules/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | 3 | slo_exporter 4 | dist 5 | tmp 6 | 7 | **/*.pos 8 | **/test_output/ 9 | -------------------------------------------------------------------------------- /pkg/dynamic_classifier/testdata/TestMatcherRegexpDumpCSV.golden: -------------------------------------------------------------------------------- 1 | test-domain,test-app,test-class,.* 2 | -------------------------------------------------------------------------------- /test/Test_SloHeaders/classifications.csv: -------------------------------------------------------------------------------- 1 | testdomain,frontend-api,high_fast,"^ppchit_rule$" 2 | 3 | -------------------------------------------------------------------------------- /grafana_dashboards/SLO_detailed.json: -------------------------------------------------------------------------------- 1 | ../examples/all_in_one/grafana/provisioning/dashboards/SLO_detailed.json -------------------------------------------------------------------------------- /grafana_dashboards/slo_exporter.json: -------------------------------------------------------------------------------- 1 | 
../examples/all_in_one/grafana/provisioning/dashboards/slo_exporter.json -------------------------------------------------------------------------------- /examples/all_in_one/example-domain-slo-conf.yaml: -------------------------------------------------------------------------------- 1 | ../../tools/slo-rules-generator/all-in-one-example-domain.yaml -------------------------------------------------------------------------------- /grafana_dashboards/SLO_drilldown.json: -------------------------------------------------------------------------------- 1 | ../examples/all_in_one/grafana/provisioning/dashboards/SLO_drilldown.json -------------------------------------------------------------------------------- /pkg/dynamic_classifier/testdata/TestLoadRegexpMatchesFromMultipleCSV.golden: -------------------------------------------------------------------------------- 1 | test-domain,test-app,test-class,".*" -------------------------------------------------------------------------------- /pkg/dynamic_classifier/testdata/TestMatcherExactDumpCSV.golden: -------------------------------------------------------------------------------- 1 | test-domain,test-app,test-class,test-endpoint 2 | -------------------------------------------------------------------------------- /grafana_dashboards/SLO_domains_overview.json: -------------------------------------------------------------------------------- 1 | ../examples/all_in_one/grafana/provisioning/dashboards/SLO_domains_overview.json -------------------------------------------------------------------------------- /examples/all_in_one/slo-exporter/conf/classification.csv: -------------------------------------------------------------------------------- 1 | example-domain,example-app,critical,"^(GET|POST|HEAD|PUT|DELETE):.*" 2 | -------------------------------------------------------------------------------- /pkg/dynamic_classifier/testdata/TestLoadExactMatchesFromMultipleCSV.golden: 
-------------------------------------------------------------------------------- 1 | test-domain,test-app,test-class,"GET:/testing-endpoint" -------------------------------------------------------------------------------- /pkg/dynamic_classifier/testdata/Test_DynamicClassifier_Classify_OverridesCacheFromConfig.golden: -------------------------------------------------------------------------------- 1 | domain,app,class,GET:/testing-endpoint 2 | -------------------------------------------------------------------------------- /pkg/dynamic_classifier/testdata/TestClassificationByExactMatches.golden: -------------------------------------------------------------------------------- 1 | # Test comment 2 | test-domain,test-app,test-class,"GET:/testing-endpoint" 3 | -------------------------------------------------------------------------------- /test/Test_SloHeadersUpdateCache/classifications.csv: -------------------------------------------------------------------------------- 1 | userportal,frontend-api,high_fast,"^(GET|POST|HEAD|PUT|DELETE):/api/v1/ppchit/rule/[0-9a-f]+$" 2 | -------------------------------------------------------------------------------- /test/Test_MetricsInitialization/classifications.csv: -------------------------------------------------------------------------------- 1 | testdomain,frontend-api,high_fast,"^(GET|POST|HEAD|PUT|DELETE):/api/v1/ppchit/rule/[0-9a-f]+$" 2 | 3 | -------------------------------------------------------------------------------- /test/Test_SloHeaders/README.md: -------------------------------------------------------------------------------- 1 | # SloHeaders 2 | - Test that SLO classification as provided within log line is correctly propagated to created SLO event 3 | -------------------------------------------------------------------------------- /grafana_dashboards/README.md: -------------------------------------------------------------------------------- 1 | # Grafana dashboards 2 | ### [slo-exporter 
dashboard](./slo_exporter.json) 3 | Dashboard visualising application metrics of slo-exporter itself. 4 | -------------------------------------------------------------------------------- /pkg/dynamic_classifier/testdata/TestClassificationByRegexpMatches.golden: -------------------------------------------------------------------------------- 1 | # Test comment 2 | test-domain,test-app,test-class,"/api/test/.*" 3 | test-domain,test-app,test-class-all,"/api/.*" 4 | -------------------------------------------------------------------------------- /examples/prometheus/regexp_events_classification.csv: -------------------------------------------------------------------------------- 1 | # Endpoints which should be highly available but latency can be higher depending on amount of requested data. 2 | api,prometheus,high_slow,"/api/.*" 3 | -------------------------------------------------------------------------------- /pkg/slo_event_producer/testdata/slo_rules_invalid.yaml.golden: -------------------------------------------------------------------------------- 1 | rules: 2 | - failure_condddddddditions: 3 | - operator: numberIsHigherThan 4 | key: "statusCode" 5 | value: 500 6 | hahahahaha: 7 | -------------------------------------------------------------------------------- /prometheus/recording_rules/slo_data_corrections.yaml: -------------------------------------------------------------------------------- 1 | 2 | # SLO data correction playbook: playbooks/blob/master/howto/slo-data-correction.md 3 | 4 | groups: 5 | - name: slo-data-corrections 6 | interval: 3m 7 | rules: 8 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | Here you can find example configurations of slo-exporter for distinct use-cases. 
// Storage contracts implemented by the containers of this package.
type (
	// Container is the basic item-store contract: items are handed over with
	// Add, read back through the receive-only channel returned by Stream, and
	// counted with Len. Ordering and channel-closing behavior are up to the
	// implementation and are not fixed by this interface.
	Container interface {
		Len() int
		Add(item interface{})
		Stream() <-chan interface{}
	}

	// CappedContainer is a Container which additionally reports a Capacity.
	// NOTE(review): presumably Capacity is the upper bound on Len - confirm
	// against the implementations.
	CappedContainer interface {
		Capacity() int
		Container
	}
)
/pkg/slo_event_producer/testdata/slo_rules_valid.yaml.golden: -------------------------------------------------------------------------------- 1 | rules: 2 | - slo_matcher: 3 | domain: domain 4 | failure_conditions: 5 | - operator: numberIsHigherThan 6 | key: "statusCode" 7 | value: 500 8 | additional_metadata: 9 | slo_type: availability 10 | -------------------------------------------------------------------------------- /kubernetes/slo-exporter-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: slo-exporter 5 | spec: 6 | type: ClusterIP 7 | ports: 8 | - name: grpc-logging 9 | port: 18090 10 | protocol: TCP 11 | targetPort: 18090 12 | selector: 13 | app: slo-exporter 14 | -------------------------------------------------------------------------------- /test/Test_SloHeadersUpdateCache/slo_rules.yaml: -------------------------------------------------------------------------------- 1 | rules: 2 | - slo_matcher: 3 | domain: userportal 4 | failure_conditions: 5 | - operator: numberIsHigherThan 6 | key: statusCode 7 | value: 499 8 | additional_metadata: 9 | slo_type: availability 10 | slo_version: 1 11 | -------------------------------------------------------------------------------- /pkg/dynamic_classifier/matcher.go: -------------------------------------------------------------------------------- 1 | package dynamic_classifier 2 | 3 | import ( 4 | "io" 5 | 6 | "github.com/seznam/slo-exporter/pkg/event" 7 | ) 8 | 9 | type matcherType string 10 | 11 | type matcher interface { 12 | getType() matcherType 13 | set(key string, classification *event.SloClassification) error 14 | get(key string) (*event.SloClassification, error) 15 | dumpCSV(w io.Writer) error 16 | } 17 | -------------------------------------------------------------------------------- /test/Test_SloHeadersUpdateCache/README.md: 
-------------------------------------------------------------------------------- 1 | # SloHeadersUpdateCache 2 | 3 | On 3 log lines, verify that 4 | - first log line which does not contain SLO classification information is classified according to the dynamic classifier initial config 5 | - second log line is classified according to the information which are contained within it 6 | - third line is classified according to the information within the previous log line, even though it does not bear any SLO classification information 7 | -------------------------------------------------------------------------------- /test/Test_SloHeaders/logs: -------------------------------------------------------------------------------- 1 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="userportal" slo-app="frontend-api" slo-class="critical" slo-endpoint="ppchit_rule" slo-result="success" 2 | -------------------------------------------------------------------------------- /tools/slo-rules-generator/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/seznam/slo-exporter/tools/slo-rules-generator 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/prometheus/common v0.30.0 7 | // We fetch the exact revision because of issue described at https://github.com/prometheus/prometheus/issues/6048#issuecomment-534549253 8 | github.com/prometheus/prometheus v1.8.2-0.20210914090109-37468d88dce8 9 | github.com/stretchr/testify v1.7.0 10 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b 11 | ) 12 | -------------------------------------------------------------------------------- 
/pkg/envoy_access_log_server/util.go: -------------------------------------------------------------------------------- 1 | package envoy_access_log_server 2 | 3 | import ( 4 | "fmt" 5 | 6 | pbduration "github.com/golang/protobuf/ptypes/duration" 7 | ) 8 | 9 | // Returns deterministic string representation of the given duration - ns. 10 | func pbDurationDeterministicString(d *pbduration.Duration) (string, error) { 11 | if d == nil { 12 | return "", fmt.Errorf(" duration given") 13 | } 14 | if !d.IsValid() { 15 | return "", fmt.Errorf("invalid duration given: %s", d) 16 | } 17 | return fmt.Sprint(d.AsDuration().Nanoseconds()) + "ns", nil 18 | } 19 | -------------------------------------------------------------------------------- /examples/kafka/slo-exporter/slo_rules.yaml: -------------------------------------------------------------------------------- 1 | rules: 2 | - failure_conditions: 3 | - key: result 4 | operator: isNotEqualTo 5 | value: "success" 6 | additional_metadata: 7 | slo_type: availability 8 | slo_version: 1 9 | 10 | # Mark event as failed for slo_type: quality if any of the observed quality degradations occurred 11 | - failure_conditions: 12 | - key: degradation_slave_response 13 | operator: numberIsHigherThan 14 | value: 0 15 | additional_metadata: 16 | slo_type: quality 17 | slo_version: 1 18 | -------------------------------------------------------------------------------- /examples/kafka/slo-exporter/slo_exporter.yaml: -------------------------------------------------------------------------------- 1 | webServerListenAddress: "0.0.0.0:8080" 2 | 3 | pipeline: ["kafkaIngester", "eventKeyGenerator", "sloEventProducer", "prometheusExporter"] 4 | 5 | modules: 6 | kafkaIngester: 7 | brokers: 8 | - "kafka-1:9092" 9 | - "kafka-2:9092" 10 | - "kafka-3:9092" 11 | topic: slo-exporter 12 | groupId: slo-exporter 13 | logKafkaEvents: true 14 | 15 | eventKeyGenerator: 16 | metadataKeys: 17 | - "name" 18 | 19 | sloEventProducer: 20 | rulesFiles: 21 | - 
"slo_rules.yaml" 22 | 23 | prometheusExporter: {} 24 | -------------------------------------------------------------------------------- /prometheus/recording_rules/error-budget.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | # Used for simple vizualizations in Grafana. 3 | - name: slo-error-budget 4 | interval: 1m 5 | rules: 6 | - record: slo:error_budget 7 | expr: | 8 | slo:violation_ratio{slo_time_range="4w"} 9 | * on (slo_domain,slo_version, namespace) group_left() 10 | max(slo:stable_version) by (slo_class,slo_domain,slo_version, slo_type, namespace) 11 | / on (slo_class,slo_domain,slo_version, slo_type, namespace) group_left () 12 | (slo:violation_ratio_threshold - 1) 13 | + 1 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /test/Test_MetricsInitialization/README.md: -------------------------------------------------------------------------------- 1 | # MetricsInitialization 2 | 3 | - Test whether all expected metrics have been properly initialized on single log line. 4 | For all of the aggregated metrics, we check that both possible results values have been exposed and that `le` is filled according to the domain configuration file. 5 | 6 | - There is also single log line which is supposed to be filtered based on provided status code. We test that by checking the total number of read lines. 7 | - The other single log line which gets processed hits the configured normalizer rule, so that endpoint name is transformed as configured. 
// httpHeadersRoundTripper is an http.RoundTripper decorator which sets a fixed
// collection of HTTP headers on every request before delegating to the wrapped
// round tripper.
type httpHeadersRoundTripper struct {
	// headers maps a header name to the value set on each outgoing request.
	headers map[string]string
	// roudTripper is the wrapped transport which performs the actual request.
	// The misspelled name is kept on purpose: code outside this block may
	// refer to the field.
	roudTripper http.RoundTripper
}

// RoundTrip implements the http.RoundTripper interface.
//
// We use RoundTripper to inject HTTP headers even though it is not advised,
// but the Prometheus client does not allow us to do it otherwise. To stay
// within the RoundTripper contract (the request passed in must not be
// modified), the headers are set on a clone of r and the caller's request is
// left untouched. When no inner round tripper is configured,
// http.DefaultTransport is used.
func (h httpHeadersRoundTripper) RoundTrip(r *http.Request) (*http.Response, error) {
	next := h.roudTripper
	if next == nil {
		next = http.DefaultTransport
	}
	if len(h.headers) == 0 {
		// Nothing to inject, no need to pay for a clone.
		return next.RoundTrip(r)
	}

	req := r.Clone(r.Context())
	if req.Header == nil {
		// Hand-built requests may carry a nil header map; Set would panic.
		req.Header = make(http.Header, len(h.headers))
	}
	for name, value := range h.headers {
		req.Header.Set(name, value)
	}

	return next.RoundTrip(req)
}
4 | 5 | ## Workload type 6 | 7 | We recommend using a StatefulSet instead of a Deployment to mitigate high churn in long-term metrics - the StatefulSet's stable pod name format (which is propagated to Prometheus' instance label) makes it easier for Prometheus to calculate the SLO over long time periods. 8 | 9 | ## Configuration 10 | 11 | We recommend either building your own Docker image based on the upstream one with the configuration baked in, or including the configuration as versioned configmap(s), in order to simplify rollbacks. 12 | 13 | In this example we use it without the versioning just for simplification. 14 | 15 | -------------------------------------------------------------------------------- /tools/slo-rules-generator/all-in-one-example-domain.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for slo-rules-generator tool. 2 | # If modified, run slo-rules-generator and move its output to recording_rules/ 3 | # See ../../tools/slo-rules-generator/README.md for more information. 
4 | example-domain: 5 | enabled: true 6 | namespace: test 7 | version: 1 8 | alerting: 9 | team: example-team@company.org 10 | escalate: sre-team@company.org 11 | classes: 12 | critical: 13 | availability: {slo_threshold: 0.90} 14 | latency90: { 15 | slo_threshold: 0.90, 16 | slo_threshold_metadata: {percentile: 90, le: 0.6} 17 | } 18 | latency99: { 19 | slo_threshold: 0.99, 20 | slo_threshold_metadata: {percentile: 99, le: 12.0} 21 | } 22 | -------------------------------------------------------------------------------- /docs/modules/event_metadata_renamer.md: -------------------------------------------------------------------------------- 1 | # Event metadata renamer 2 | 3 | *Module status is _experimental_, it may be modified or removed even in non-major release.* 4 | 5 | | | | 6 | |----------------|------------------------| 7 | | `moduleName` | `eventMetadataRenamer` | 8 | | Module type | `processor` | 9 | | Input event | `raw` | 10 | | Output event | `raw` | 11 | 12 | This module allows you to modify the event metadata by renaming its keys. Refusals of overriding an already existing _destination_ are reported as a Warning log as well as within exposed Prometheus' metric. 13 | 14 | `moduleConfig` 15 | ```yaml 16 | eventMetadataRenamerConfigs: 17 | - source: keyX 18 | destination: keyY 19 | ``` 20 | -------------------------------------------------------------------------------- /examples/prometheus/exact_events_classification.csv: -------------------------------------------------------------------------------- 1 | # Critical endpoints which should be fast and highly available. 2 | ui,prometheus,critical,"/" 3 | ui,prometheus,critical,"/graph" 4 | ui,prometheus,critical,"/metrics" 5 | 6 | # Less important endpoints with low latency expected. 
7 | ui,prometheus,high_fast,"/flags" 8 | ui,prometheus,high_fast,"/static/*filepath" 9 | ui,prometheus,high_fast,"/version" 10 | ui,prometheus,high_fast,"/config" 11 | 12 | # Endpoints which should be highly available but latency can be higher depending on amount of rendered data. 13 | ui,prometheus,high_slow,"/federate" 14 | ui,prometheus,high_slow,"/targets" 15 | ui,prometheus,high_slow,"/service-discovery" 16 | ui,prometheus,high_slow,"/rules" 17 | ui,prometheus,high_slow,"/alerts" 18 | ui,prometheus,high_slow,"/consoles/*filepath" 19 | -------------------------------------------------------------------------------- /docs/modules/relabel.md: -------------------------------------------------------------------------------- 1 | # Relabel 2 | 3 | | | | 4 | |----------------|--------------| 5 | | `moduleName` | `relabel` | 6 | | Module type | `processor` | 7 | | Input event | `raw` | 8 | | Output event | `raw` | 9 | 10 | This module allows you to modify the event metadata or drop the event altogether. 11 | It uses native Prometheus `relabel_config` syntax. In this case metadata is referred to as labels. 12 | See [the upstream documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config) 13 | for more info. Referenced metadata keys need to be valid Prometheus label names. 14 | 15 | 16 | `moduleConfig` 17 | ```yaml 18 | eventRelabelConfigs: 19 | - <relabel_config> 20 | ``` 21 | 22 | You can find some [examples here](/examples). 23 | -------------------------------------------------------------------------------- /prometheus/alerts/slo_data_corrections.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: slo-data-corrections-alert 3 | interval: 3m 4 | rules: 5 | 6 | # This is a generic alert which fires when there is an SLO data correction metric that is no longer needed. 7 | # To make it work, please make sure your data correction rule always exposes 0 when it is not active. 
8 | - alert: "Expired SLO data correction" 9 | expr: 'avg_over_time(slo:correction_window[10m]) == 0' 10 | for: 10m 11 | labels: 12 | severity: info 13 | team: sre 14 | annotations: 15 | title: "Some SLO data correction is no longer used" 16 | description: "SLO data correction for incident {{$labels.incident}} is no longer used. You can erase it." 17 | playbook: "howto/slo-data-correction.md" 18 | 19 | 20 | -------------------------------------------------------------------------------- /test/Test_MetricsInitialization/logs: -------------------------------------------------------------------------------- 1 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" uct="0.000" uht="0.127" urt="0.127" cc="frontend-api" occ="-" url="532" ourl="-" 2 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 404 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" uct="0.000" uht="0.127" urt="0.127" cc="frontend-api" occ="-" url="532" ourl="-" 3 | -------------------------------------------------------------------------------- /prometheus/alerts/error-budget.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: slo-exporter-slo-error-budget-alerts 3 | rules: 4 | 5 | - alert: ErrorBudgetExhausted 6 | expr: 7 | slo:stable_version{enabled!="false"} 8 | * on(slo_version, slo_domain, namespace) group_right(escalate, team) 9 | slo:violation_ratio{slo_time_range="4w"} 10 | / on 
(slo_class,slo_domain,slo_version,slo_type,namespace) group_left () 11 | ( 12 | slo:violation_ratio_threshold - 1 13 | ) 14 | +1 <= 0 15 | for: 10m 16 | labels: 17 | severity: warning 18 | alert_type: slo:error_budget_exhausted 19 | annotations: 20 | title: 'Error budget is exhausted.' 21 | description: '{{$labels.slo_type | title}} error budget for SLO domain "{{$labels.slo_domain}}" was exhausted.' 22 | playbook: howto/SLO-workflow.md 23 | -------------------------------------------------------------------------------- /examples/kafka/README.md: -------------------------------------------------------------------------------- 1 | # Kafka ingester SLO example 2 | 3 | This example shows a simple configuration of slo-exporter using 4 | [`kafka_ingester`](/docs/modules/kafka_ingester.md) 5 | as a source of data in order to compute SLO of a server which publishes events through Kafka. 6 | 7 | #### How to run it 8 | In root of the repo 9 | ```bash 10 | make docker 11 | cd examples/kafka 12 | docker compose up -d 13 | ``` 14 | Once started see http://localhost:8080/metrics. 15 | 16 | ## How SLO is computed 17 | Kafkacat is used to publish events to Kafka on behalf of an imaginary server. Each event contains its SLO classification together with metadata. 18 | 19 | ## Observed SLO types 20 | #### `availability` 21 | All events whose "result" metadata key equals "success" are considered successful. 22 | 23 | #### `quality` 24 | All events whose quality degradation tracking metadata key(s) all equal 0 are considered successful. 
25 | -------------------------------------------------------------------------------- /examples/all_in_one/grafana/provisioning/datasources/datasource.yml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | # list of datasources that should be deleted from the database 5 | deleteDatasources: 6 | - name: Prometheus 7 | orgId: 1 8 | 9 | # list of datasources to insert/update depending 10 | # what's available in the database 11 | datasources: 12 | # name of the datasource. Required 13 | - name: Prometheus 14 | # datasource type. Required 15 | type: prometheus 16 | # access mode. proxy or direct (Server or Browser in the UI). Required 17 | access: proxy 18 | # org id. will default to orgId 1 if not specified 19 | orgId: 1 20 | # custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically 21 | uid: my_unique_uid 22 | # url 23 | url: http://prometheus:9090 24 | -------------------------------------------------------------------------------- /test/Test_MetricsInitialization/slo_rules.yaml: -------------------------------------------------------------------------------- 1 | rules: 2 | - slo_matcher: 3 | domain: testdo.* 4 | failure_conditions: 5 | - operator: numberIsHigherThan 6 | key: statusCode 7 | value: 499 8 | additional_metadata: 9 | slo_type: availability 10 | slo_version: 1 11 | 12 | - slo_matcher: 13 | domain: testdomain 14 | class: high_fast 15 | failure_conditions: 16 | - operator: numberIsHigherThan 17 | key: requestDuration 18 | value: 8 19 | additional_metadata: 20 | slo_version: 1 21 | slo_type: latency90 22 | percentile: 90 23 | le: 8.0 24 | 25 | - slo_matcher: 26 | domain: testdomain 27 | class: high_fast 28 | failure_conditions: 29 | - operator: numberIsHigherThan 30 | key: requestDuration 31 | value: 16 32 | additional_metadata: 33 | slo_version: 1 34 | slo_type: latency99 35 | percentile: 
99 36 | le: 16.0 37 | -------------------------------------------------------------------------------- /examples/envoy_proxy/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | envoy: 5 | network_mode: "host" 6 | image: envoyproxy/envoy:v1.16-latest 7 | volumes: 8 | - "./envoy/envoy.yaml:/conf/envoy.yaml:ro" 9 | command: 10 | - "-c" 11 | - "/conf/envoy.yaml" 12 | 13 | slo-exporter: 14 | network_mode: "host" 15 | image: slo_exporter:latest 16 | working_dir: /slo-exporter 17 | command: 18 | - "--config-file=/slo-exporter/slo_exporter.yaml" 19 | - "--log-level=debug" 20 | volumes: 21 | - ./slo-exporter/:/slo-exporter/ 22 | 23 | slo-event-generator: 24 | network_mode: "host" 25 | image: curlimages/curl 26 | entrypoint: /bin/sh 27 | command: | 28 | -c 'while true; do 29 | for i in `seq 20`; do curl -s -H "slo-domain: example-domain" -H "slo-class: critical" -H "slo-app: homepage-static" http://localhost:8080/ >/dev/null 2>&1 ; done; 30 | echo -n "."; 31 | sleep 5; 32 | done' -------------------------------------------------------------------------------- /test/Test_SloHeaders/slo_rules.yaml: -------------------------------------------------------------------------------- 1 | rules: 2 | - slo_matcher: 3 | domain: userportal 4 | failure_conditions: 5 | - operator: numberIsHigherThan 6 | key: statusCode 7 | value: 499 8 | additional_metadata: 9 | slo_type: availability 10 | slo_version: 1 11 | 12 | - metadata_matcher: 13 | - operator: isMatchingRegexp 14 | key: requestDuration 15 | value: ".*" 16 | slo_matcher: 17 | domain: userportal 18 | class: critical 19 | failure_conditions: 20 | - operator: numberIsHigherThan 21 | key: requestDuration 22 | value: 8 23 | additional_metadata: 24 | slo_version: 1 25 | slo_type: latency90 26 | percentile: 90 27 | le: 8.0 28 | 29 | - slo_matcher: 30 | domain: userportal 31 | class: critical 32 | failure_conditions: 33 | - operator: numberIsHigherThan 
34 | key: requestDuration 35 | value: 16 36 | additional_metadata: 37 | slo_version: 1 38 | slo_type: latency99 39 | percentile: 99 40 | le: 16.0 41 | -------------------------------------------------------------------------------- /pkg/event/slo_classification.go: -------------------------------------------------------------------------------- 1 | package event 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/seznam/slo-exporter/pkg/stringmap" 7 | ) 8 | 9 | type SloClassification struct { 10 | Domain string 11 | App string 12 | Class string 13 | } 14 | 15 | func (sc *SloClassification) Matches(other SloClassification) bool { 16 | if sc.Domain != "" && (sc.Domain != other.Domain) { 17 | return false 18 | } 19 | if sc.Class != "" && (sc.Class != other.Class) { 20 | return false 21 | } 22 | if sc.App != "" && (sc.App != other.App) { 23 | return false 24 | } 25 | return true 26 | } 27 | 28 | func (sc *SloClassification) GetMetadata() stringmap.StringMap { 29 | return stringmap.StringMap{ 30 | "slo_domain": sc.Domain, 31 | "slo_class": sc.Class, 32 | "app": sc.App, 33 | } 34 | } 35 | 36 | func (sc *SloClassification) Copy() SloClassification { 37 | return SloClassification{ 38 | Domain: sc.Domain, 39 | App: sc.App, 40 | Class: sc.Class, 41 | } 42 | } 43 | 44 | func (sc *SloClassification) String() string { 45 | return fmt.Sprintf("%s:%s:%s", sc.Domain, sc.App, sc.Class) 46 | } 47 | -------------------------------------------------------------------------------- /scripts/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | output_folder=${1:-"profile"} 4 | packages="$(go list ./... 
| grep -v /vendor/ | xargs echo)" 5 | mkdir -p "$output_folder" 6 | 7 | for package_path in $packages; do 8 | package_name="$(basename "$package_path")" 9 | cpu_profile_file="${output_folder}/${package_name}_cpu.profile" 10 | memory_profile_file="${output_folder}/${package_name}_memory.profile" 11 | block_profile_file="${output_folder}/${package_name}_block.profile" 12 | go test \ 13 | --benchmem \ 14 | -cpuprofile="$cpu_profile_file" \ 15 | -memprofile="$memory_profile_file" \ 16 | -blockprofile="$block_profile_file" \ 17 | -bench=. \ 18 | -count 5 \ 19 | "${package_path}" 20 | if [ -e "$cpu_profile_file" ]; then 21 | go tool pprof -png "$cpu_profile_file" >"${cpu_profile_file}.png" 22 | fi 23 | if [ -e "$memory_profile_file" ]; then 24 | go tool pprof -png "$memory_profile_file" >"${memory_profile_file}.png" 25 | fi 26 | if [ -e "$block_profile_file" ]; then 27 | go tool pprof -png "$block_profile_file" >"${block_profile_file}.png" 28 | fi 29 | done 30 | -------------------------------------------------------------------------------- /examples/envoy_proxy/slo-exporter/slo_rules.yaml: -------------------------------------------------------------------------------- 1 | rules: 2 | - slo_matcher: 3 | domain: example-domain 4 | failure_conditions: 5 | - operator: numberIsEqualOrHigherThan 6 | key: responseCode 7 | value: 500 8 | additional_metadata: 9 | slo_type: availability 10 | slo_version: 1 11 | namespace: test 12 | 13 | - slo_matcher: 14 | domain: example-domain 15 | class: critical 16 | failure_conditions: 17 | - operator: durationIsHigherThan 18 | key: timeToLastDownstreamTxByte 19 | value: 10ms 20 | additional_metadata: 21 | slo_version: 1 22 | slo_type: latency90 23 | percentile: 90 24 | le: 0.01 25 | namespace: test 26 | 27 | - slo_matcher: 28 | domain: example-domain 29 | class: critical 30 | failure_conditions: 31 | - operator: durationIsHigherThan 32 | key: timeToLastDownstreamTxByte 33 | value: 50ms 34 | additional_metadata: 35 | slo_version: 1 36 | 
slo_type: latency99 37 | percentile: 99 38 | le: 0.05 39 | namespace: test 40 | -------------------------------------------------------------------------------- /examples/all_in_one/nginx/conf/nginx.conf: -------------------------------------------------------------------------------- 1 | events { 2 | worker_connections 1024; 3 | } 4 | 5 | http { 6 | server_tokens off; 7 | include mime.types; 8 | charset utf-8; 9 | 10 | log_format upstream_time '$remote_addr - $remote_user [$time_local] ' 11 | '"$request" $status $body_bytes_sent ' 12 | '"$http_referer" "$http_user_agent" ' 13 | 'rt=$request_time uct="$upstream_connect_time" uht="$upstream_header_time" urt="$upstream_response_time"'; 14 | 15 | access_log /nginx/logs/access_log upstream_time; 16 | 17 | limit_req_zone $binary_remote_addr zone=one:10m rate=1r/m; 18 | 19 | server { 20 | server_name localhost; 21 | listen 0.0.0.0:8080; 22 | 23 | set $content_class static; 24 | location / { 25 | return 200; 26 | } 27 | 28 | location /err { 29 | return 500; 30 | } 31 | 32 | location /drop { 33 | # delay incoming requests so that the client will timeout 34 | limit_req zone=one burst=5; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /prometheus/alerts/missing_all_data.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: slo-missing-all-data 3 | rules: 4 | 5 | # This is a generic alert which fires when there are no SLO data present 6 | # and thanos-rule is having trouble evaluating rules. 7 | # This typically means that gaps in SLO data are occurring. 
8 | - alert: MissingAllSloData 9 | expr: | 10 | (1 - avg_over_time( 11 | (clamp_max(sum(absent(slo:events_over_time 12 | * on(slo_version, slo_domain, namespace) group_left(escalate, team) slo:stable_version{enabled!="false"})) 13 | AND sum(increase(prometheus_rule_evaluation_failures_total[5m]) > 0),0) 14 | OR clamp_max(sum(slo:events_over_time 15 | * on(slo_version, slo_domain, namespace) group_left(escalate, team) slo:stable_version{enabled!="false"}),1))[1h:]) 16 | ) == 1 17 | for: 10m 18 | labels: 19 | severity: warning 20 | alert_type: slo:missing_data 21 | annotations: 22 | title: 'Missing SLO data.' 23 | description: 'No SLO events are occurring and thanos-rule has trouble evaluating rules.' 24 | playbook: "on-call/slo-missing-data.md" 25 | -------------------------------------------------------------------------------- /examples/all_in_one/slo-exporter/conf/slo_rules.yaml: -------------------------------------------------------------------------------- 1 | rules: 2 | - slo_matcher: 3 | domain: example-domain 4 | failure_conditions: 5 | - operator: numberIsEqualOrHigherThan 6 | key: statusCode 7 | value: 500 8 | additional_metadata: 9 | slo_type: availability 10 | slo_version: 1 11 | namespace: test 12 | cluster: test-cluster 13 | 14 | - slo_matcher: 15 | domain: example-domain 16 | class: critical 17 | failure_conditions: 18 | - operator: numberIsHigherThan 19 | key: requestDuration 20 | value: 0.8 21 | additional_metadata: 22 | slo_version: 1 23 | slo_type: latency90 24 | percentile: 90 25 | le: 0.8 26 | namespace: test 27 | cluster: test-cluster 28 | 29 | - slo_matcher: 30 | domain: example-domain 31 | class: critical 32 | failure_conditions: 33 | - operator: numberIsHigherThan 34 | key: requestDuration 35 | value: 2 36 | additional_metadata: 37 | slo_version: 1 38 | slo_type: latency99 39 | percentile: 99 40 | le: 2 41 | namespace: test 42 | cluster: test-cluster 43 | -------------------------------------------------------------------------------- 
/examples/envoy_proxy/slo-exporter/slo_exporter.yaml: -------------------------------------------------------------------------------- 1 | webServerListenAddress: "0.0.0.0:8001" 2 | maximumGracefulShutdownDuration: "10s" 3 | afterPipelineShutdownDelay: "1s" 4 | 5 | pipeline: ["envoyAccessLogServer", "relabel", "eventKeyGenerator", "metadataClassifier", "sloEventProducer", "prometheusExporter"] 6 | 7 | modules: 8 | 9 | envoyAccessLogServer: {} 10 | 11 | relabel: 12 | eventRelabelConfigs: 13 | # Drop events with unwanted status codes 14 | - source_labels: ["responseCode"] 15 | regex: "30[12]|40[045]|411|408|499" 16 | action: drop 17 | 18 | eventKeyGenerator: 19 | filedSeparator: ":" 20 | metadataKeys: 21 | - requestMethod 22 | - path 23 | 24 | metadataClassifier: 25 | sloDomainMetadataKey: http_slo-domain 26 | sloClassMetadataKey: http_slo-class 27 | sloAppMetadataKey: http_slo-app 28 | 29 | sloEventProducer: 30 | rulesFiles: 31 | - "slo_rules.yaml" 32 | 33 | prometheusExporter: 34 | metricName: "slo_events_total" 35 | labelNames: 36 | result: "result" 37 | sloDomain: "slo_domain" 38 | sloClass: "slo_class" 39 | sloApp: "slo_app" 40 | eventKey: "event_key" 41 | -------------------------------------------------------------------------------- /prometheus/alerts/missing_data.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: slo-missing-data 3 | rules: 4 | 5 | # Missing slo:burn_rate for enabled SLO domain which had non-zero events in the past 5 minutes 6 | - alert: MissingSloBurnRateDomain 7 | expr: | 8 | ( 9 | sum( 10 | slo:stable_version{enabled="true"} 11 | ) by (slo_version, slo_domain, namespace, escalate, team) 12 | unless on (slo_domain, slo_version, namespace) 13 | ( 14 | slo:burn_rate{slo_time_range='5m'} 15 | * on(slo_version, slo_domain, namespace) group_left(escalate, team) 16 | slo:stable_version{enabled="true"} 17 | ) 18 | ) 19 | and on(slo_version, slo_domain, namespace) 20 | sum( 21 | 
slo:events_over_time{slo_time_range="5m"} 22 | ) without (result, slo_type) > 0 23 | for: 5m 24 | labels: 25 | severity: critical 26 | alert_type: slo:missing_data 27 | annotations: 28 | title: 'Missing burn rate data for {{ $labels.slo_domain }}.' 29 | description: 'Burn rate probably failed to evaluate for {{ $labels.slo_domain }}.' 30 | playbook: "on-call/slo-missing-data.md" 31 | -------------------------------------------------------------------------------- /docs/modules/metadata_classifier.md: -------------------------------------------------------------------------------- 1 | # Metadata classifier 2 | 3 | | | | 4 | |----------------|------------------------------| 5 | | `moduleName` | `metadataClassifier` | 6 | | Module type | `processor` | 7 | | Input event | `raw` | 8 | | Output event | `raw` | 9 | 10 | This module allows you to classify an event using its metadata. 11 | Specify the metadata keys whose values will be used as the corresponding SLO classification items. 12 | If a key cannot be found, the original value of that classification item is left intact. 13 | By default, the module will override the existing event classification. 14 | This can be disabled so that an event is classified only if it wasn't classified before. 15 | 16 | `moduleConfig` 17 | ```yaml 18 | # Key of metadata value to be used as classification slo domain. 19 | sloDomainMetadataKey: 20 | # Key of metadata value to be used as classification slo class. 21 | sloClassMetadataKey: 22 | # Key of metadata value to be used as classification slo app. 23 | sloAppMetadataKey: 24 | # If classification of already classified event should be overwritten. 
25 | overrideExistingValues: true 26 | ``` 27 | -------------------------------------------------------------------------------- /pkg/event/slo.go: -------------------------------------------------------------------------------- 1 | package event 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/seznam/slo-exporter/pkg/stringmap" 7 | ) 8 | 9 | type Result string 10 | 11 | func (r Result) String() string { 12 | return string(r) 13 | } 14 | 15 | const ( 16 | Success Result = "success" 17 | Fail Result = "fail" 18 | ) 19 | 20 | var PossibleResults = []Result{Success, Fail} 21 | 22 | type Slo struct { 23 | // same value as in source event Raw.EventKey() 24 | Key string 25 | Result Result 26 | 27 | Domain string 28 | Class string 29 | App string 30 | 31 | Metadata stringmap.StringMap 32 | Quantity float64 33 | 34 | OriginalEvent Raw 35 | } 36 | 37 | func (s *Slo) IsClassified() bool { 38 | return s.Domain != "" && s.Class != "" && s.App != "" 39 | } 40 | 41 | func (s *Slo) String() string { 42 | return fmt.Sprintf("SLO event %q of domain: %q, class: %q, app: %q with metadata: %+v", s.Key, s.Domain, s.Class, s.App, s.Metadata) 43 | } 44 | 45 | func (s Slo) Copy() Slo { 46 | return Slo{ 47 | Key: s.Key, 48 | Result: s.Result, 49 | Domain: s.Domain, 50 | Class: s.Class, 51 | App: s.App, 52 | Metadata: s.Metadata.Copy(), 53 | OriginalEvent: s.OriginalEvent, 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /scripts/generate_godoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CURRENT_PKG=$(go list -m) 3 | INTERFACE=localhost:6060 4 | 5 | DST_DIR=${1:-"public/godoc"} 6 | 7 | # run a godoc server 8 | go get golang.org/x/tools/cmd/godoc 9 | godoc -http=$INTERFACE & DOC_PID=$! 
10 | 11 | sleep 10 12 | # Wait for the server to start 13 | until curl -sSf "http://$INTERFACE/pkg/$CURRENT_PKG/" > /dev/null 14 | do 15 | sleep 1 16 | done 17 | sleep 1 18 | 19 | # recursive fetch entire web including CSS & JS 20 | # turn off robots check, otherwise might get blocked with details in `robots.txt` file 21 | # only get the directories we are looking for 22 | wget -r -p \ 23 | -e robots=off \ 24 | --include-directories="/lib/godoc,/pkg/$CURRENT_PKG,/src/$CURRENT_PKG" \ 25 | --exclude-directories="/pkg/$CURRENT_PKG/vendor,/src/$CURRENT_PKG/vendor" \ 26 | "http://$INTERFACE/pkg/$CURRENT_PKG/" 27 | 28 | # Stop the godoc server 29 | kill -9 $DOC_PID 30 | 31 | # all file will be generated into `localhost:6060` folder, hence we move them out from docker to local machine 32 | mkdir -p "$(dirname "$DST_DIR")" 33 | rm -rf "$DST_DIR" 34 | mv "$INTERFACE" "$DST_DIR" 35 | # replace relative links 36 | find "$DST_DIR" -name "*.html" -exec sed -Ei 's/\/(lib|src|pkg)\//\/slo-exporter\/godoc\/\1\//g' {} + 37 | -------------------------------------------------------------------------------- /pkg/prober/prober_test.go: -------------------------------------------------------------------------------- 1 | package prober 2 | 3 | import ( 4 | "net/http" 5 | "net/http/httptest" 6 | "testing" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/sirupsen/logrus" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | func TestProber(t *testing.T) { 14 | p, err := NewLiveness(prometheus.NewRegistry(), logrus.New()) 15 | assert.NoError(t, err) 16 | p.Ok() 17 | assert.Equal(t, nil, p.IsOk()) 18 | p.NotOk(ErrDefault) 19 | assert.Equal(t, ErrDefault, p.IsOk()) 20 | p.Ok() 21 | assert.Equal(t, nil, p.IsOk()) 22 | } 23 | 24 | func TestProber_HandleFunc(t *testing.T) { 25 | p, err := NewLiveness(prometheus.NewRegistry(), logrus.New()) 26 | assert.NoError(t, err) 27 | req, err := http.NewRequest(http.MethodGet, "/liveness", http.NoBody) 28 | if err != nil { 29 | 
t.Fatal(err) 30 | } 31 | handler := http.HandlerFunc(p.HandleFunc) 32 | 33 | rr := httptest.NewRecorder() 34 | handler.ServeHTTP(rr, req) 35 | assert.Equal(t, http.StatusOK, rr.Code) 36 | 37 | rr = httptest.NewRecorder() 38 | p.NotOk(ErrDefault) 39 | handler.ServeHTTP(rr, req) 40 | assert.Equal(t, http.StatusServiceUnavailable, rr.Code) 41 | 42 | rr = httptest.NewRecorder() 43 | p.Ok() 44 | handler.ServeHTTP(rr, req) 45 | assert.Equal(t, http.StatusOK, rr.Code) 46 | } 47 | -------------------------------------------------------------------------------- /test/Test_SloHeadersUpdateCache/metrics: -------------------------------------------------------------------------------- 1 | # HELP slo_domain_slo_class_slo_app_event_key:slo_events_total Total number of SLO events exported with it's result and metadata. 2 | # TYPE slo_domain_slo_class_slo_app_event_key:slo_events_total counter 3 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="fail",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="availability",slo_version="1"} 0 4 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="success",slo_app="frontend-api",slo_class="critical",slo_domain="userportal",slo_type="availability",slo_version="1"} 2 5 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="fail",slo_app="frontend-api",slo_class="high_fast",slo_domain="userportal",slo_type="availability",slo_version="1"} 0 6 | slo_domain_slo_class_slo_app_event_key:slo_events_total{event_key="GET:/api/v1/ppchit/rule/0",result="success",slo_app="frontend-api",slo_class="high_fast",slo_domain="userportal",slo_type="availability",slo_version="1"} 1 7 | # HELP slo_exporter_tailer_lines_read_total Total number of lines tailed from the file. 
8 | # TYPE slo_exporter_tailer_lines_read_total counter 9 | slo_exporter_tailer_lines_read_total 3 10 | -------------------------------------------------------------------------------- /docs/modules/event_key_generator.md: -------------------------------------------------------------------------------- 1 | # Event key generator 2 | 3 | | | | 4 | |----------------|---------------------| 5 | | `moduleName` | `eventKeyGenerator` | 6 | | Module type | `processor` | 7 | | Input event | `raw` | 8 | | Output event | `raw` | 9 | 10 | This module allows you to generate an identifier of the event type. 11 | It will join all values of specified event metadata keys (if found) using the separator 12 | and use it as the new identifier. 13 | 14 | `moduleConfig` 15 | ```yaml 16 | # Separator to be used to join the selected metadata values. 17 | filedSeparator: ":" 18 | # If the event key should be overwritten if it's already set for the input event. 19 | overrideExistingEventKey: true 20 | # Keys which values will be joined as the resulting eventKey in specified order 21 | metadataKeys: 22 | - 23 | ``` 24 | 25 | If given metadata_key is missing in the event's metadata, the empty value is not included in the resulting eventKey. 26 | 27 | E.g. given the following configuration: 28 | ``` 29 | metadataKeys: 30 | app: test_app 31 | name: test_name 32 | endpoint: test_endpoint 33 | ``` 34 | The following metadata `{'app': 'test_app', 'endpoint': 'test_endpoint'}` would result to event_key `test_app:test_endpoint`. 
35 | 36 | -------------------------------------------------------------------------------- /pkg/slo_event_producer/config.go: -------------------------------------------------------------------------------- 1 | package slo_event_producer 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/seznam/slo-exporter/pkg/stringmap" 8 | "gopkg.in/yaml.v2" 9 | ) 10 | 11 | type sloMatcher struct { 12 | DomainRegexp string `yaml:"domain"` 13 | ClassRegexp string `yaml:"class"` 14 | AppRegexp string `yaml:"app"` 15 | } 16 | 17 | type operatorOptions struct { 18 | Operator string `yaml:"operator"` 19 | Key string `yaml:"key"` 20 | Value string `yaml:"value"` 21 | } 22 | 23 | type ruleOptions struct { 24 | MetadataMatcherConditionsOptions []operatorOptions `yaml:"metadata_matcher"` 25 | SloMatcher sloMatcher `yaml:"slo_matcher"` 26 | FailureConditionsOptions []operatorOptions `yaml:"failure_conditions"` 27 | AdditionalMetadata stringmap.StringMap `yaml:"additional_metadata,omitempty"` 28 | } 29 | 30 | type rulesConfig struct { 31 | Rules []ruleOptions `yaml:"rules"` 32 | } 33 | 34 | func (rc *rulesConfig) loadFromFile(path string) error { 35 | yamlFile, err := os.ReadFile(path) 36 | if err != nil { 37 | return fmt.Errorf("failed to load configuration file: %w", err) 38 | } 39 | err = yaml.UnmarshalStrict(yamlFile, rc) 40 | if err != nil { 41 | return fmt.Errorf("failed to unmarshall configuration file: %w", err) 42 | } 43 | return nil 44 | } 45 | -------------------------------------------------------------------------------- /test/Test_SloHeadersUpdateCache/logs: -------------------------------------------------------------------------------- 1 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" 
ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="-" slo-app="-" slo-class="-" slo-endpoint="-" slo-result="-" 2 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="userportal" slo-app="frontend-api" slo-class="critical" slo-endpoint="-" slo-result="success" 3 | 127.0.0.1 - - [12/Nov/2019:10:26:00 +0100] "GET /api/v1/ppchit/rule/5dca7aa7713c09001003cf46 HTTP/1.1" 200 352 "https://www.sklik.cz/automatic-rules?table=(limit:10,page:1)" uag="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/3.4.1 Chrome/61.0.3163.100 Electron/2.0.18 Safari/537.36" "-" ua="127.0.0.1:6050" rt="0.127" frpc-status="-" slo-domain="-" slo-app="-" slo-class="-" slo-endpoint="-" slo-result="-" 4 | -------------------------------------------------------------------------------- /docs/operating.md: -------------------------------------------------------------------------------- 1 | # Operating 2 | 3 | ## Debugging 4 | If you need to dynamically change the log level of the application, you can use the `/logging` HTTP endpoint. 5 | To set the log level use the `POST` method with URL parameter `level` of value `error`, `warning`, `info` or `debug`. 6 | 7 | Example using `cURL` 8 | ```bash 9 | # Use GET to get current log level. 10 | $ curl -s http://0.0.0.0:8080/logging 11 | current logging level is: debug 12 | 13 | # Use POST to set the log level. 14 | $ curl -XPOST -s http://0.0.0.0:8080/logging?level=info 15 | logging level set to: info 16 | ``` 17 | 18 | #### Profiling 19 | In case of issues with leaking resources for example, slo-exporter supports the 20 | Go profiling using pprof on `/debug/pprof/` web interface path. 
For usage see the official [docs](https://golang.org/pkg/net/http/pprof/). 21 | 22 | 23 | ## Frequently asked questions 24 | 25 | ### How to add new normalization replacement rule? 26 | Event normalization can be done using the `relabel` module, see [its documentation](modules/relabel.md). 27 | 28 | ### How to deal with malformed lines? 29 | Before !87. If you are seeing too many malformed lines then you should inspect [tailer package](pkg/tailer/tailer.go) and seek for variable `lineParseRegexp`. 30 | After !87, slo-exporter main config supports to specify custom regular expression in field `.module.tailer.loglineParseRegexp`. 31 | -------------------------------------------------------------------------------- /kubernetes/slo-exporter-statefulset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: slo-exporter 5 | spec: 6 | replicas: 2 7 | serviceName: slo-exporter 8 | podManagementPolicy: Parallel 9 | selector: 10 | matchLabels: 11 | app: slo-exporter 12 | template: 13 | metadata: 14 | labels: 15 | app: slo-exporter 16 | name: slo-exporter 17 | annotations: 18 | prometheus.metrics.scrape: "true" 19 | prometheus.metrics.port: "8080" 20 | prometheus.metrics.path: "/metrics" 21 | spec: 22 | containers: 23 | - name: slo-exporter 24 | image: seznam/slo-exporter:latest 25 | workingDir: "/slo-exporter-config" 26 | args: 27 | - --config-file=slo_exporter.yaml 28 | resources: 29 | requests: 30 | cpu: "0.1" 31 | memory: "100Mi" 32 | limits: 33 | cpu: "0.5" 34 | memory: "250Mi" 35 | ports: 36 | - containerPort: 8080 37 | protocol: TCP 38 | - containerPort: 18090 39 | protocol: TCP 40 | volumeMounts: 41 | - name: slo-exporter-config 42 | mountPath: "/slo-exporter-config" 43 | volumes: 44 | - name: slo-exporter-config 45 | configMap: 46 | name: slo-exporter-config 47 | -------------------------------------------------------------------------------- /docs/architecture.md: 
-------------------------------------------------------------------------------- 1 | # Architecture 2 | SLO-exporter is written in Go and built using [the pipeline pattern](https://blog.golang.org/pipelines). 3 | 4 | The processed event is passed from one module to another to allow its modification or filtering 5 | for the final state to be reported as an SLI event. 6 | 7 | The flow of the processing pipeline can be dynamically set using the configuration file, so it can be used 8 | for various use cases and event types. 9 | 10 | ### Event Types 11 | Slo-exporter differentiates between two event types: 12 | 13 | ##### Raw 14 | This is an event which came from the data source, it has metadata and quantity 15 | and you can set its event key which will be in the resulting metrics and can be used for classification of the event. 16 | 17 | ##### SLO event 18 | Final event generated from the raw event. This event already has an evaluated result and classification 19 | and is then reported to output metrics. 20 | 21 | ### Module types 22 | There is a set of implemented modules to be used, divided into three basic types based on their input/output. 23 | 24 | ##### `producer` 25 | Does not read any events but produces them. These modules serve as sources of the events. 26 | 27 | ##### `ingester` 28 | Reads events but does not produce any. These modules serve for reporting the SLO metrics to some external systems. 29 | 30 | ##### `processor` 31 | Combination of `producer` and `ingester`. It reads an event and produces a new or modified one. 
32 | -------------------------------------------------------------------------------- /examples/all_in_one/README.md: -------------------------------------------------------------------------------- 1 | # All-in-one example 2 | 3 | ### Overview 4 | Use the provided [docker-compose](./docker-compose.yaml) to start the complete setup with 5 | Prometheus instance loaded with [example SLO recording rules and alerts](/prometheus), 6 | and Grafana instance with loaded [SLO dashboards](/grafana_dashboards). 7 | 8 | Description of the whole setup follows: 9 | - **Nginx configured with the following paths:** 10 | - `nginx:8080/` -> `HTTP 200`, all ok 11 | - `nginx:8080/err` -> `HTTP 500`, availability violation 12 | - `nginx:8080/drop`-> `limit 1r/m`, latency violation 13 | - **Slo-exporter configured to tail the Nginx's logs** 14 | - **Prometheus** 15 | - configured to scrape the slo-exporter's metrics 16 | - loaded with necessary recording-rules for SLO computation 17 | - **Grafana** 18 | - with Prometheus preconfigured as a datasource 19 | - loaded with [SLO dashboards](/grafana_dashboards/) 20 | - **Slo-event-generator** 21 | - an infinite loop accessing the Nginx instance to generate slo-events. 22 | 23 | ### How to run it 24 | ```bash 25 | docker-compose pull && docker-compose up 26 | ``` 27 | 28 | To access Grafana and Prometheus: 29 | ``` 30 | # http://localhost:9090 Prometheus 31 | # http://localhost:3000 Grafana 32 | # User: admin 33 | # Password: admin 34 | ``` 35 | 36 | **Please note that it may take up to 5 minutes until Grafana dashboards will show any data. 
This is caused by evaluation interval of the included Prometheus recording rules.** 37 | -------------------------------------------------------------------------------- /examples/envoy_proxy/README.md: -------------------------------------------------------------------------------- 1 | # Envoy proxy SLO example 2 | 3 | This example shows a simple configuration of slo-exporter using 4 | [`envoy access-log-server module`](/docs/modules/envoy_access_log_server.md). 5 | 6 | #### How to run it 7 | In root of the repo 8 | ```bash 9 | make docker 10 | cd examples/envoy_proxy 11 | docker-compose up -d 12 | ``` 13 | Once started, see http://localhost:8001/metrics. 14 | 15 | ## How SLO is computed 16 | - [envoyAccessLogServer module](/docs/modules/envoy_access_log_server.md) is used to receive envoy's logs via grpc. 17 | - [relabel module](/docs/modules/relabel.md) drops the unwanted events (e.g. based on its HTTP status code, userAgent,...). 18 | - [metadataClassifier module](/docs/modules/metadata_classifier.md) classifies the generated event based on HTTP headers sent by the client. 19 | 20 | ## Observed SLO types 21 | Refer to [slo_rules.yaml](./slo-exporter/slo_rules.yaml) for the exact configuration of how SLO events are generated based on input logs/events. 22 | 23 | #### `availability` 24 | For every log line which results in classified event in domain `example-domain`, an SLO event is generated. Its result is determined based on the `responseCode` metadata key - with all events with `responseCode >= 500` being marked as failed. 25 | 26 | #### `latency90`, `latency99` 27 | For every log line which results in classified event in domain `example-domain` and slo_class `critical`, an SLO event is generated. Its result is determined based on `timeToLastDownstreamTxByte` metadata key. 
28 | -------------------------------------------------------------------------------- /pkg/storage/in_memory.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | ) 7 | 8 | // NewInMemoryCappedContainer create new in-memory capped container with capacity limit. 9 | func NewInMemoryCappedContainer(capacity int) CappedContainer { 10 | return &inMemoryCappedContainer{ 11 | list: list.New(), 12 | capacity: capacity, 13 | } 14 | } 15 | 16 | type inMemoryCappedContainer struct { 17 | list *list.List 18 | capacity int 19 | lock sync.RWMutex 20 | } 21 | 22 | // Len returns current size of container. 23 | func (h *inMemoryCappedContainer) Len() int { 24 | h.lock.RLock() 25 | defer h.lock.RUnlock() 26 | return h.list.Len() 27 | } 28 | 29 | // Capacity returns maximum limit of the capped container. 30 | func (h *inMemoryCappedContainer) Capacity() int { 31 | return h.capacity 32 | } 33 | 34 | // Add adds new item to container. 35 | func (h *inMemoryCappedContainer) Add(record interface{}) { 36 | h.lock.Lock() 37 | defer h.lock.Unlock() 38 | h.list.PushFront(record) 39 | 40 | // Drop items exceeding capacity limit. 41 | if h.list.Len() > h.capacity { 42 | h.list.Remove(h.list.Back()) 43 | } 44 | } 45 | 46 | // Stream writes items to returned channel. 
47 | func (h *inMemoryCappedContainer) Stream() <-chan interface{} { 48 | stream := make(chan interface{}) 49 | go func() { 50 | h.lock.RLock() 51 | defer h.lock.RUnlock() 52 | for e := h.list.Front(); e != nil; e = e.Next() { 53 | stream <- e.Value 54 | } 55 | close(stream) 56 | }() 57 | return stream 58 | } 59 | -------------------------------------------------------------------------------- /prometheus/recording_rules/burn-rate.yaml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: slo-violation-ratio-and-burn-rate 3 | interval: 1m 4 | rules: 5 | - record: slo:violation_ratio 6 | expr: | 7 | ( 8 | # Produce zero instead of NaN. 9 | 0 == sum by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) ( 10 | slo:events_over_time 11 | ) 12 | ) 13 | or on(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) 14 | # Otherwise, compute the actual violation ratio, if possible 15 | ( 16 | sum by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) ( 17 | slo:events_over_time{result="fail"} 18 | ) 19 | / 20 | sum by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) ( 21 | slo:events_over_time 22 | ) 23 | ) 24 | or on(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) 25 | # Otherwise, if no failed events are present, return 0 violation_ratio for the given set of labels 26 | ( 27 | 0 * count by(namespace, slo_class, slo_domain, slo_time_range, slo_type, slo_version) ( 28 | slo:events_over_time 29 | ) 30 | ) 31 | - record: slo:burn_rate 32 | expr: 33 | slo:violation_ratio 34 | / on(slo_class, slo_domain, slo_version, slo_type, namespace) group_left() 35 | (1 - slo:violation_ratio_threshold) 36 | -------------------------------------------------------------------------------- /tools/slo-rules-generator/slo-domains.yaml.example: -------------------------------------------------------------------------------- 1 | 
userportal-reports-v6: # config-name 2 | domain: userportal-reports # falls back to config-name if not specified 3 | enabled: true 4 | namespace: production 5 | version: 6 6 | alerting: 7 | team: team.x@company.com 8 | escalate: sre.x@company.com 9 | # Thresholds for burn-rate alerts. If not present, the following defaults are used for all domains and classes: 10 | # 1h: 13.44 11 | # 6h: 5.6 12 | # 1d: 2.8 13 | # 3d: 1 14 | # class and slo_type in conditions may be an empty string, which effectively matches any class or slo_type 15 | burn_rate_thresholds: 16 | - condition: 17 | class: 'critical' 18 | slo_type: 'availability' 19 | time_range: '1h' 20 | value: 13.44 21 | - condition: 22 | class: 'low' 23 | slo_type: 'latency90' 24 | time_range: '6h' 25 | value: 33.12 26 | classes: 27 | critical: # slo_class name 28 | availability: 29 | slo_threshold: 0.99 30 | latency90: 31 | slo_threshold: 0.90 32 | # labels added to resulting slo:violation_ratio_threshold metrics 33 | slo_threshold_metadata: 34 | percentile: 90 35 | le: 0.8 # The 0.8 is the maximum duration of an event to be considered successful, taken from slo_rules.yaml 36 | low: 37 | availability: 38 | slo_threshold: 0.99 39 | latency90: 40 | slo_threshold: 0.90 41 | slo_threshold_metadata: 42 | percentile: 90 43 | le: 0.8 44 | -------------------------------------------------------------------------------- /pkg/prometheus_ingester/query_executor_test.go: -------------------------------------------------------------------------------- 1 | package prometheus_ingester 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/prometheus/common/model" 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func Test_queryResult_applyStaleness(t *testing.T) { 12 | ts := model.Time(0) 13 | fingerprint := model.Fingerprint(0) 14 | tests := []struct { 15 | name string 16 | input queryResult 17 | staleness time.Duration 18 | ts time.Time 19 | expectedMetrics int 20 | }{ 21 | { 22 | name: "keep recent samples", 23 | 
ts: ts.Time().Add(time.Minute), 24 | staleness: defaultStaleness, 25 | input: queryResult{ 26 | timestamp: ts.Time(), 27 | metrics: map[model.Fingerprint]model.SamplePair{ 28 | fingerprint: { 29 | Timestamp: ts, 30 | Value: 0, 31 | }, 32 | }, 33 | }, 34 | expectedMetrics: 1, 35 | }, 36 | { 37 | name: "drop outdated samples", 38 | ts: ts.Time().Add(time.Minute + defaultStaleness), 39 | staleness: defaultStaleness, 40 | input: queryResult{ 41 | timestamp: ts.Time(), 42 | metrics: map[model.Fingerprint]model.SamplePair{ 43 | fingerprint: { 44 | Timestamp: ts, 45 | Value: 0, 46 | }, 47 | }, 48 | }, 49 | expectedMetrics: 0, 50 | }, 51 | } 52 | for _, tt := range tests { 53 | t.Run(tt.name, func(t *testing.T) { 54 | tt.input.dropStaleResults(tt.staleness, tt.ts) 55 | assert.Equalf(t, tt.expectedMetrics, len(tt.input.metrics), "unexpected number of metrics in result: %s", tt.input.metrics) 56 | }) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /pkg/event/raw.go: -------------------------------------------------------------------------------- 1 | package event 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/seznam/slo-exporter/pkg/stringmap" 7 | ) 8 | 9 | // Raw represents single event as received by an EventsProcessor instance. 10 | type Raw struct { 11 | Metadata stringmap.StringMap 12 | SloClassification *SloClassification 13 | Quantity float64 14 | } 15 | 16 | const ( 17 | eventKeyMetadataKey = "__eventKey" 18 | ) 19 | 20 | func (r *Raw) EventKey() string { 21 | return r.Metadata[eventKeyMetadataKey] 22 | } 23 | 24 | func (r *Raw) SetEventKey(k string) { 25 | if r.Metadata == nil { 26 | r.Metadata = make(stringmap.StringMap) 27 | } 28 | r.Metadata[eventKeyMetadataKey] = k 29 | } 30 | 31 | // UpdateSLOClassification updates SloClassification field. 
32 | func (r *Raw) UpdateSLOClassification(classification *SloClassification) { 33 | r.SloClassification = classification 34 | } 35 | 36 | // IsClassified checks whether the Domain, App and Class fields of SloClassification are set. 37 | func (r *Raw) IsClassified() bool { 38 | if r.SloClassification != nil && 39 | r.SloClassification.Domain != "" && 40 | r.SloClassification.App != "" && 41 | r.SloClassification.Class != "" { 42 | 43 | return true 44 | } 45 | return false 46 | } 47 | 48 | func (r Raw) GetSloMetadata() stringmap.StringMap { 49 | if r.SloClassification == nil { 50 | return nil 51 | } 52 | metadata := r.SloClassification.GetMetadata() 53 | return metadata 54 | } 55 | 56 | func (r Raw) GetSloClassification() *SloClassification { 57 | return r.SloClassification 58 | } 59 | 60 | func (r Raw) String() string { 61 | return fmt.Sprintf("key: %s, quantity: %f, metadata: %s, classification: %s", r.EventKey(), r.Quantity, r.Metadata, r.GetSloMetadata()) 62 | } 63 | -------------------------------------------------------------------------------- /examples/all_in_one/slo-exporter/conf/slo_exporter.yaml: -------------------------------------------------------------------------------- 1 | webServerListenAddress: "0.0.0.0:8001" 2 | maximumGracefulShutdownDuration: "10s" 3 | afterPipelineShutdownDelay: "1s" 4 | 5 | pipeline: ["tailer", "relabel", "eventKeyGenerator", "dynamicClassifier", "sloEventProducer", "prometheusExporter"] 6 | 7 | modules: 8 | 9 | tailer: 10 | tailedFile: "/logs/access_log" 11 | follow: true 12 | reopen: true 13 | positionFile: "" 14 | positionPersistenceInterval: "2s" 15 | loglineParseRegexp: '^(?P[A-Fa-f0-9.:]{4,50}) - \S+ \[(?P