├── .github
│   └── workflows
│       ├── docker-image.yml
│       └── goreleaser.yml
├── .gitignore
├── .goreleaser.yml
├── Dockerfile
├── LICENSE
├── README.md
├── charts
│   ├── README.md
│   ├── archives
│   │   ├── .gitkeep
│   │   ├── index.yaml
│   │   ├── kminion-0.1.0.tgz
│   │   ├── kminion-0.1.1.tgz
│   │   ├── kminion-0.1.2.tgz
│   │   ├── kminion-0.1.3.tgz
│   │   ├── kminion-0.11.1.tgz
│   │   ├── kminion-0.11.2.tgz
│   │   ├── kminion-0.11.3.tgz
│   │   ├── kminion-0.12.0.tgz
│   │   ├── kminion-0.2.0.tgz
│   │   ├── kminion-0.2.1.tgz
│   │   ├── kminion-0.2.2.tgz
│   │   ├── kminion-0.3.0.tgz
│   │   ├── kminion-0.3.1.tgz
│   │   ├── kminion-0.4.0.tgz
│   │   ├── kminion-0.5.0.tgz
│   │   ├── kminion-0.6.0.tgz
│   │   ├── kminion-0.7.0.tgz
│   │   ├── kminion-0.8.0.tgz
│   │   ├── kminion-0.8.1.tgz
│   │   ├── kminion-0.8.2.tgz
│   │   ├── kminion-0.8.3.tgz
│   │   └── kminion-0.9.0.tgz
│   └── kminion
│       ├── .helmignore
│       ├── Chart.yaml
│       ├── templates
│       │   ├── NOTES.txt
│       │   ├── _helpers.tpl
│       │   ├── configmap.yaml
│       │   ├── daemonset.yaml
│       │   ├── deployment.yaml
│       │   ├── hpa.yaml
│       │   ├── ingress.yaml
│       │   ├── poddisruptionbudget.yaml
│       │   ├── service.yaml
│       │   ├── serviceaccount.yaml
│       │   └── servicemonitor.yaml
│       └── values.yaml
├── config.go
├── docker-compose.yml
├── docs
│   ├── end-to-end.md
│   ├── metrics.md
│   ├── reference-config.yaml
│   └── screenshots
│       ├── kminion-cluster.png
│       ├── kminion-groups.png
│       └── kminion-topics.png
├── e2e
│   ├── client_hooks.go
│   ├── config.go
│   ├── config_consumer.go
│   ├── config_producer.go
│   ├── config_topic.go
│   ├── consumer.go
│   ├── endtoend_message.go
│   ├── group_tracker.go
│   ├── message_tracker.go
│   ├── producer.go
│   ├── service.go
│   ├── topic.go
│   ├── topic_test.go
│   └── utils.go
├── go.mod
├── go.sum
├── kafka
│   ├── client_config_helper.go
│   ├── client_logger.go
│   ├── config.go
│   ├── config_sasl.go
│   ├── config_sasl_gssapi.go
│   ├── config_sasl_oauthbearer.go
│   ├── config_tls.go
│   └── service.go
├── logging
│   ├── config.go
│   └── logger.go
├── main.go
├── minion
│   ├── client_hooks.go
│   ├── config.go
│   ├── config_consumer_group.go
│   ├── config_log_dirs.go
│   ├── config_topic_config.go
│   ├── consumer_group_offsets.go
│   ├── describe_consumer_groups.go
│   ├── describe_topic_config.go
│   ├── list_offsets.go
│   ├── log_dirs.go
│   ├── metadata.go
│   ├── offset_consumer.go
│   ├── service.go
│   ├── storage.go
│   ├── utils.go
│   └── versions.go
└── prometheus
    ├── collect_broker_info.go
    ├── collect_cluster_info.go
    ├── collect_consumer_group_lags.go
    ├── collect_consumer_groups.go
    ├── collect_exporter_metrics.go
    ├── collect_log_dirs.go
    ├── collect_topic_info.go
    ├── collect_topic_partition_offsets.go
    ├── config.go
    └── exporter.go
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: docker-image
3 | on:
4 | push:
5 | tags: ['*']
6 | branches: ['master']
7 | paths-ignore: ['charts/**']
8 | permissions:
9 | id-token: write
10 | contents: read
11 | jobs:
12 | build:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: aws-actions/configure-aws-credentials@v4
16 | with:
17 | aws-region: ${{ vars.RP_AWS_CRED_REGION }}
18 | role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }}
19 | - uses: aws-actions/aws-secretsmanager-get-secrets@v2
20 | with:
21 | secret-ids: |
22 | ,sdlc/prod/github/dockerhub
23 | parse-json-secrets: true
24 | - uses: actions/checkout@v4
25 | - uses: docker/setup-qemu-action@v3
26 | - uses: docker/setup-buildx-action@v3
27 | with:
28 | driver-opts: |
29 | image=moby/buildkit:v0.21.1
30 | network=host
31 | - name: Set build date
32 | run: |
33 | echo "BUILT_AT=$(date --rfc-3339=date)" >> ${GITHUB_ENV}
34 | - uses: docker/metadata-action@v5
35 | id: docker_meta
36 | with:
37 | # list of Docker images to use as base name for tags
38 | images: |
39 | redpandadata/kminion
40 | name=public.ecr.aws/l9j0i2e0/kminion,enable=${{ startsWith(github.ref, 'refs/tags/v') }}
41 | # generate Docker tags based on the following events/attributes
42 | # Semver type is only active on 'push tag' events,
43 | # hence no enable condition required
44 | tags: |
45 | type=sha,prefix={{branch}}-,format=short,enable={{is_default_branch}}
46 | type=semver,pattern={{raw}}
47 | - uses: docker/login-action@v3
48 | with:
49 | username: ${{ env.DOCKERHUB_USER }}
50 | password: ${{ env.DOCKERHUB_TOKEN }}
51 | - uses: aws-actions/configure-aws-credentials@v4
52 | if: ${{ startsWith(github.ref, 'refs/tags/v') }}
53 | with:
54 | aws-region: us-east-1
55 | role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }}
56 | - uses: aws-actions/amazon-ecr-login@v2
57 | if: ${{ startsWith(github.ref, 'refs/tags/v') }}
58 | with:
59 | registry-type: public
60 | - uses: docker/build-push-action@v6
61 | with:
62 | provenance: false
63 | push: true
64 | platforms: linux/amd64,linux/arm64
65 | tags: ${{ steps.docker_meta.outputs.tags }}
66 | build-args: |
67 | VERSION=${{ fromJSON(steps.docker_meta.outputs.json).labels['org.opencontainers.image.version'] }}
68 | BUILT_AT=${{ env.BUILT_AT }}
69 | COMMIT=${{ github.sha }}
70 | cache-from: type=gha
71 | cache-to: type=gha,mode=max
72 |
--------------------------------------------------------------------------------
/.github/workflows/goreleaser.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: goreleaser
3 | on:
4 | push:
5 | tags: ['*']
6 | jobs:
7 | goreleaser:
8 | runs-on: ubuntu-latest
9 | permissions:
10 | contents: write
11 | steps:
12 | - uses: actions/checkout@v4
13 | with:
14 | fetch-depth: 0
15 | - uses: actions/setup-go@v5
16 | with:
17 | go-version-file: 'go.mod'
18 | - uses: goreleaser/goreleaser-action@v6
19 | if: startsWith(github.ref, 'refs/tags/')
20 | with:
21 | version: latest
22 | args: release --clean
23 | workdir: .
24 | env:
25 | CGO_ENABLED: 0
26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, build with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 | zk-single-kafka-single
14 | zk-multiple-kafka-multiple
15 | .vscode
16 | .idea
17 |
18 | config
19 | /kminion
20 |
--------------------------------------------------------------------------------
/.goreleaser.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: 2
3 | release:
4 | name_template: '{{.Version}} / {{time "2006-01-02"}}'
5 | prerelease: auto
6 | mode: append
7 | footer: |
8 | ## Docker Image
9 | Use the following command to pull this release's Docker image:
10 | ```sh
11 | docker pull redpandadata/kminion:{{ .Tag }}
12 | ```
13 | changelog:
14 | disable: false
15 | use: github
16 | filters:
17 | # Commit messages matching the regexp listed here will be removed from the changelog
18 | exclude:
19 | - '^docs:'
20 | - '^test:'
21 | - '^npm:'
22 | - '^go.mod:'
23 | - '^.github:'
24 | - 'Merge branch'
25 | builds:
26 | - id: kminion
27 | binary: kminion
28 | goos:
29 | - darwin
30 | - linux
31 | - windows
32 | goarch:
33 | - amd64
34 | - arm64
35 | ldflags:
36 | - -s -w -X main.version={{.Version}} -X main.builtAt={{.Date}} -X main.commit={{.Commit}}
37 | checksum:
38 | name_template: 'checksums.txt'
39 |
--------------------------------------------------------------------------------
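
The .goreleaser.yml above is driven by the goreleaser GitHub Actions workflow on tag pushes. For a local dry run without publishing a release, a snapshot build along these lines should work (assuming the goreleaser CLI is installed locally):

```shell
# Build all configured platforms locally without creating a GitHub release;
# artifacts land in the dist/ directory.
goreleaser release --snapshot --clean
```
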
/Dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Build image
3 | ############################################################
4 | FROM golang:1.24-alpine AS builder
5 |
6 | ARG VERSION
7 | ARG BUILT_AT
8 | ARG COMMIT
9 |
10 | RUN apk update && apk add --no-cache git ca-certificates && update-ca-certificates
11 |
12 | WORKDIR /app
13 |
14 | COPY go.mod .
15 | COPY go.sum .
16 | RUN go mod download
17 |
18 | COPY . .
19 |
20 | RUN CGO_ENABLED=0 go build \
21 | -ldflags="-w -s \
22 | -X main.version=$VERSION \
23 | -X main.commit=$COMMIT \
24 | -X main.builtAt=$BUILT_AT" \
25 | -o ./bin/kminion
26 |
27 | ############################################################
28 | # Runtime Image
29 | ############################################################
30 | FROM alpine:3
31 | COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
32 | COPY --from=builder /app/bin/kminion /app/kminion
33 | RUN addgroup -S redpanda \
34 | && adduser -S redpanda -G redpanda \
35 | && chmod o+rx /app/kminion \
36 | && apk upgrade --no-cache
37 | USER redpanda
38 |
39 | ENTRYPOINT ["/app/kminion"]
40 |
--------------------------------------------------------------------------------
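
The Dockerfile above compiles a static binary and injects version metadata through the `VERSION`, `COMMIT` and `BUILT_AT` build args. A minimal sketch of building the image locally while supplying those args (the `kminion:dev` tag and the arg values are only illustrative placeholders):

```shell
# Build the image from the repo root, passing the same build args the
# Dockerfile declares.
docker build \
  --build-arg VERSION=dev \
  --build-arg COMMIT="$(git rev-parse --short HEAD)" \
  --build-arg BUILT_AT="$(date --rfc-3339=date)" \
  -t kminion:dev .
```
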
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 CloudHut
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Prometheus Exporter for Apache Kafka - KMinion
2 |
3 | KMinion (previously known as Kafka Minion) is a feature-rich and flexible Prometheus Exporter to monitor your Apache
4 | Kafka cluster. All valuable information that is accessible via the Kafka protocol is supposed to be accessible through
5 | KMinion.
6 |
7 | ## 🚀 Features
8 |
9 | - **Kafka versions:** Supports all Kafka versions v0.11+
10 | - **Supported SASL mechanisms:** plain, scram-sha-256/512, gssapi/kerberos
11 | - **TLS support:** TLS is supported, regardless of whether you need mTLS, a custom CA, encrypted keys or just the trusted
12 | root certs
13 | - **Consumer Group Lags:** Number of messages a consumer group is lagging behind the latest offset
14 | - **Log dir sizes:** Metric for log dir sizes either grouped by broker or by topic
15 | - **Broker info:** Metric for each broker with its address, broker id, controller and rack id
16 | - **Configurable granularity:** Export metrics (e.g. consumer group lags) either per partition or per topic. Helps to reduce the number of exported metric series.
17 | - **End-to-End Monitoring:** Sends messages to its own topic and consumes them, measuring a message's real-world "roundtrip" latency. Also provides ack latency and offset-commit latency. [More Info](/docs/end-to-end.md)
18 | - **Configurable targets:** You can configure which topics or groups you'd like to export using regular expressions
19 | - **Multiple config parsers:** It's possible to configure KMinion using YAML, environment variables or a mix of both
20 |
21 | You can find a list of all exported metrics here: [/docs/metrics.md](/docs/metrics.md)
22 |
23 | ## Getting started
24 |
25 | ### 🐳 Docker image
26 |
27 | All images will be built on each push to master or for every new release. You can find an overview of all available tags
28 | in our [DockerHub repository](https://hub.docker.com/r/redpandadata/kminion/tags).
29 |
30 | ```shell
31 | docker pull redpandadata/kminion:latest
32 | ```
33 |
34 | ### ☸ Helm chart
35 |
36 | A Helm chart will be maintained as part of Redpanda's [helm-charts](https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion) repository.
37 |
38 | ### 🔧 Configuration
39 |
40 | All options in KMinion can be configured via YAML or environment variables. Configuring some options via YAML and some
41 | via environment variables is also possible. Environment variables take precedence in this case. You can find the
42 | reference config with additional documentation in [/docs/reference-config.yaml](/docs/reference-config.yaml).
43 |
44 | If you want to use a YAML config file, specify the path to the config file by setting the env variable
45 | `CONFIG_FILEPATH`.
46 |
47 | ### 📊 Grafana Dashboards
48 |
49 | I uploaded three separate Grafana dashboards that can be used as inspiration for creating your own dashboards. Note that these dashboards might not work for you out of the box due to different labeling in your Prometheus config.
50 |
51 | Cluster Dashboard: https://grafana.com/grafana/dashboards/14012
52 |
53 | Consumer Group Dashboard: https://grafana.com/grafana/dashboards/14014
54 |
55 | Topic Dashboard: https://grafana.com/grafana/dashboards/14013
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 | ### ⚡ Testing locally
64 |
65 | This repo contains a docker-compose file that you can run on your machine. It will spin up a Kafka & ZooKeeper cluster
66 | and start KMinion on port 8080, which is exposed to your host machine:
67 |
68 | ```shell
69 | # 1. Clone this repo
70 | # 2. Browse to the repo's root directory and run:
71 | docker-compose up
72 | ```
73 |
74 | ## Chat with us
75 |
76 | We use Slack to communicate. If you are looking for more interactive discussions or support, you are invited to join
77 | our Slack server: https://redpanda.com/slack
78 |
79 | ## License
80 |
81 | KMinion is distributed under the [MIT License](https://github.com/cloudhut/kminion/blob/master/LICENSE).
82 |
--------------------------------------------------------------------------------
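
As the configuration section of the README above notes, KMinion reads an optional YAML file via `CONFIG_FILEPATH` and lets environment variables override individual values. A minimal sketch of running a locally built binary this way (the file path and broker address are placeholders):

```shell
# Load the YAML config and override the broker list via an environment
# variable; env vars take precedence over values from the YAML file.
export CONFIG_FILEPATH=/etc/kminion/config.yaml
export KAFKA_BROKERS=localhost:9092
./kminion
```
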
/charts/README.md:
--------------------------------------------------------------------------------
1 | # Helm Chart
2 |
3 | ⚠️ This chart has been moved to https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion . Please install the chart from that repository instead. The existing archives are still hosted here so that existing deployments don't break.
4 |
5 | ---
6 |
7 | This chart is intentionally very light on input validation. The goal is to offer a flexible Helm chart that allows
8 | users to deploy KMinion the way they want. This flexibility comes at the cost of less input validation, so a
9 | misconfiguration may only surface as a runtime error.
10 |
11 | All available inputs are documented in the [values.yaml](./kminion/values.yaml) file.
12 |
13 | ## Installing the Helm chart
14 |
15 | ```shell
16 | helm repo add kminion https://raw.githubusercontent.com/cloudhut/kminion/master/charts/archives
17 | helm repo update
18 | helm install -f values.yaml kminion kminion/kminion
19 | ```
20 |
--------------------------------------------------------------------------------
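
The chart renders `.Values.kminion.config` into a ConfigMap that KMinion loads via `CONFIG_FILEPATH` (see `templates/configmap.yaml` and `templates/deployment.yaml`). A hedged sketch of an install with a custom values file; the `kafka.brokers` key is assumed from KMinion's reference config and the bootstrap address is a placeholder:

```shell
# Write a minimal values file that embeds a KMinion config, then install.
cat > my-values.yaml <<'EOF'
kminion:
  config:
    kafka:
      brokers:
        - my-kafka-bootstrap:9092
EOF
helm repo add kminion https://raw.githubusercontent.com/cloudhut/kminion/master/charts/archives
helm repo update
helm install -f my-values.yaml kminion kminion/kminion
```
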
/charts/archives/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/.gitkeep
--------------------------------------------------------------------------------
/charts/archives/index.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | entries:
3 | kminion:
4 | - apiVersion: v2
5 | appVersion: v2.2.5
6 | created: "2023-07-03T16:38:22.568312+01:00"
7 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
8 | Kafka
9 | digest: 32e2ee36d0b0a045061d4e1490780fef905b4c85d7a23659819c5cb128aaa119
10 | name: kminion
11 | type: application
12 | urls:
13 | - kminion-0.12.0.tgz
14 | version: 0.12.0
15 | - apiVersion: v2
16 | appVersion: v2.2.5
17 | created: "2023-07-03T16:38:22.567922+01:00"
18 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
19 | Kafka
20 | digest: 8a7be130d57f6f8ead720277b69319ff4dcd364859e80f4750416abe5ed460c3
21 | name: kminion
22 | type: application
23 | urls:
24 | - kminion-0.11.3.tgz
25 | version: 0.11.3
26 | - apiVersion: v2
27 | appVersion: v2.2.3
28 | created: "2023-07-03T16:38:22.5675+01:00"
29 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
30 | Kafka
31 | digest: 42991a871f58b6d31a9e5b38539eb3d1e9cd35c0097a0fcf63f21f818fa7a999
32 | name: kminion
33 | type: application
34 | urls:
35 | - kminion-0.11.2.tgz
36 | version: 0.11.2
37 | - apiVersion: v2
38 | appVersion: v2.2.3
39 | created: "2023-07-03T16:38:22.566877+01:00"
40 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
41 | Kafka
42 | digest: 65d7231f1e8ee586bec42bc383b66726d596fe03e0f3183e14b688174a3a8112
43 | name: kminion
44 | type: application
45 | urls:
46 | - kminion-0.11.1.tgz
47 | version: 0.11.1
48 | - apiVersion: v2
49 | appVersion: v2.2.0
50 | created: "2023-07-03T16:38:22.575384+01:00"
51 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
52 | Kafka
53 | digest: 358bdd509f573049d4bfe77d2edb94c7ad3938f609aea11a8e2c2dc65cca2a9a
54 | name: kminion
55 | type: application
56 | urls:
57 | - kminion-0.9.0.tgz
58 | version: 0.9.0
59 | - apiVersion: v2
60 | appVersion: v2.2.0
61 | created: "2023-07-03T16:38:22.574906+01:00"
62 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
63 | Kafka
64 | digest: be8f0047b345d3954fc7c7e7f8953a848c909ef253107d6e77ed747843ddd167
65 | name: kminion
66 | type: application
67 | urls:
68 | - kminion-0.8.3.tgz
69 | version: 0.8.3
70 | - apiVersion: v2
71 | appVersion: v2.1.0
72 | created: "2023-07-03T16:38:22.573746+01:00"
73 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
74 | Kafka
75 | digest: 888bc665cddc6b6b99af1ce6dd1dea0b107a2e928dff6bfe1c077bc741e20ef7
76 | name: kminion
77 | type: application
78 | urls:
79 | - kminion-0.8.2.tgz
80 | version: 0.8.2
81 | - apiVersion: v2
82 | appVersion: v2.1.0
83 | created: "2023-07-03T16:38:22.573271+01:00"
84 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
85 | Kafka
86 | digest: e59c5d5574f162708bf1434c266acbfd9040a89aa7a4abd4a0db70885248e38d
87 | name: kminion
88 | type: application
89 | urls:
90 | - kminion-0.8.1.tgz
91 | version: 0.8.1
92 | - apiVersion: v2
93 | appVersion: v2.1.0
94 | created: "2023-07-03T16:38:22.572697+01:00"
95 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
96 | Kafka
97 | digest: f54d8236f8cf03c863b53e077e1647164ffe2a7c34e1cf77101fa3312c589706
98 | name: kminion
99 | type: application
100 | urls:
101 | - kminion-0.8.0.tgz
102 | version: 0.8.0
103 | - apiVersion: v2
104 | appVersion: v2.1.0
105 | created: "2023-07-03T16:38:22.572269+01:00"
106 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
107 | Kafka
108 | digest: 4cc64cd9f78bd55673b00612579157e493020fb76440abbef10fe5152aef9acc
109 | name: kminion
110 | type: application
111 | urls:
112 | - kminion-0.7.0.tgz
113 | version: 0.7.0
114 | - apiVersion: v2
115 | appVersion: v2.1.0
116 | created: "2023-07-03T16:38:22.571852+01:00"
117 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
118 | Kafka
119 | digest: 0955e04fe9ef4b516fb0d9ed439ae79778ccdffcf817f09099790cb7e183e4d4
120 | name: kminion
121 | type: application
122 | urls:
123 | - kminion-0.6.0.tgz
124 | version: 0.6.0
125 | - apiVersion: v2
126 | appVersion: v2.0.0
127 | created: "2023-07-03T16:38:22.571391+01:00"
128 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
129 | Kafka
130 | digest: d3eb64d05535e136802538662eef7e9fdfdb3f0b93b6a42dfdcc93ee7deeadbd
131 | name: kminion
132 | type: application
133 | urls:
134 | - kminion-0.5.0.tgz
135 | version: 0.5.0
136 | - apiVersion: v2
137 | appVersion: v2.0.0
138 | created: "2023-07-03T16:38:22.570618+01:00"
139 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
140 | Kafka
141 | digest: 6b4209352d1dffd7873791ee1573dc325eb08d67656b01b430729f45dea4c09a
142 | name: kminion
143 | type: application
144 | urls:
145 | - kminion-0.4.0.tgz
146 | version: 0.4.0
147 | - apiVersion: v2
148 | appVersion: v2.0.0
149 | created: "2023-07-03T16:38:22.570281+01:00"
150 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
151 | Kafka
152 | digest: c51e3b45791e9fd51f33036916b0d36f7ac695e2fa916a9e99882ea83914ed97
153 | name: kminion
154 | type: application
155 | urls:
156 | - kminion-0.3.1.tgz
157 | version: 0.3.1
158 | - apiVersion: v2
159 | appVersion: v2.0.0
160 | created: "2023-07-03T16:38:22.569892+01:00"
161 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
162 | Kafka
163 | digest: a2be2dd8a02dc5222ec7386195a0e25b2682a39bbdcf52b60793c171acac7653
164 | name: kminion
165 | type: application
166 | urls:
167 | - kminion-0.3.0.tgz
168 | version: 0.3.0
169 | - apiVersion: v2
170 | appVersion: v2.0.0
171 | created: "2023-07-03T16:38:22.569445+01:00"
172 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
173 | Kafka
174 | digest: 321b6d5ff95ce310d2a3257b3d55f9ced51de99af6519d6d91723d7bdb6456fa
175 | name: kminion
176 | type: application
177 | urls:
178 | - kminion-0.2.2.tgz
179 | version: 0.2.2
180 | - apiVersion: v2
181 | appVersion: v2.0.0
182 | created: "2023-07-03T16:38:22.569089+01:00"
183 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
184 | Kafka
185 | digest: ed57df27158521a1eb33d215731fcc3248c71b3f36a4a029eb2d3a7b617ca519
186 | name: kminion
187 | type: application
188 | urls:
189 | - kminion-0.2.1.tgz
190 | version: 0.2.1
191 | - apiVersion: v2
192 | appVersion: v2.0.0
193 | created: "2023-07-03T16:38:22.568694+01:00"
194 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
195 | Kafka
196 | digest: 025661ee7cc574ad8dde7a68093a3b614fc92e26dd5dd398fc89d0b5308010e1
197 | name: kminion
198 | type: application
199 | urls:
200 | - kminion-0.2.0.tgz
201 | version: 0.2.0
202 | - apiVersion: v2
203 | appVersion: v2.0.0
204 | created: "2023-07-03T16:38:22.566269+01:00"
205 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
206 | Kafka
207 | digest: e277e976d864b4bd2e505038dd865a9300486ae8c4323d3f0be40b84df75732b
208 | name: kminion
209 | type: application
210 | urls:
211 | - kminion-0.1.3.tgz
212 | version: 0.1.3
213 | - apiVersion: v2
214 | appVersion: v2.0.0
215 | created: "2023-07-03T16:38:22.565773+01:00"
216 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
217 | Kafka
218 | digest: 562937d3613624c55984e51adbc6765e7898d1cf8cc2d7d241b6d671bbc12303
219 | name: kminion
220 | type: application
221 | urls:
222 | - kminion-0.1.2.tgz
223 | version: 0.1.2
224 | - apiVersion: v2
225 | appVersion: v2.0.0
226 | created: "2023-07-03T16:38:22.562776+01:00"
227 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
228 | Kafka
229 | digest: 25e83d7c7cc92a63268d76b13ecc13077758b48be093490f281498a4f55ad3ca
230 | name: kminion
231 | type: application
232 | urls:
233 | - kminion-0.1.1.tgz
234 | version: 0.1.1
235 | - apiVersion: v2
236 | appVersion: v2.0.0
237 | created: "2023-07-03T16:38:22.562046+01:00"
238 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
239 | Kafka
240 | digest: 7c10e9d9957e9752bc6f4b4a1fffb742d88cd57be06bf4f26ff7b5031645ccbd
241 | name: kminion
242 | type: application
243 | urls:
244 | - kminion-0.1.0.tgz
245 | version: 0.1.0
246 | generated: "2023-07-03T16:38:22.560328+01:00"
247 |
--------------------------------------------------------------------------------
/charts/archives/kminion-0.1.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.1.1.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.1.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.1.2.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.2.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.1.3.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.3.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.11.1.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.11.1.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.11.2.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.11.2.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.11.3.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.11.3.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.12.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.12.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.2.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.2.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.2.1.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.2.1.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.2.2.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.2.2.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.3.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.3.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.3.1.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.3.1.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.4.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.4.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.5.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.5.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.6.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.6.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.7.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.7.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.8.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.0.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.8.1.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.1.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.8.2.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.2.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.8.3.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.3.tgz
--------------------------------------------------------------------------------
/charts/archives/kminion-0.9.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.9.0.tgz
--------------------------------------------------------------------------------
/charts/kminion/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/charts/kminion/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: kminion
3 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 0.12.0
19 |
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | appVersion: "v2.2.5"
25 |
--------------------------------------------------------------------------------
/charts/kminion/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | 1. Get the application URL by running these commands:
2 | {{- if .Values.ingress.enabled }}
3 | {{- range .Values.ingress.hosts }}
4 | http://{{ . }}
5 | {{- end }}
6 | {{- else if contains "NodePort" .Values.service.type }}
7 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kminion.fullname" . }})
8 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
9 | echo http://$NODE_IP:$NODE_PORT
10 | {{- else if contains "LoadBalancer" .Values.service.type }}
11 | NOTE: It may take a few minutes for the LoadBalancer IP to be available.
12 | You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kminion.fullname" . }}'
13 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kminion.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
14 | echo http://$SERVICE_IP:{{ .Values.service.port }}
15 | {{- else if contains "ClusterIP" .Values.service.type }}
16 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kminion.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
17 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
18 | echo "Visit http://127.0.0.1:8080 to use your application"
19 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
20 | {{- end }}
21 |
--------------------------------------------------------------------------------
/charts/kminion/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "kminion.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 |
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "kminion.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 |
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "kminion.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 |
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "kminion.labels" -}}
37 | helm.sh/chart: {{ include "kminion.chart" . }}
38 | {{ include "kminion.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- if .Values.customLabels}}
44 | {{ toYaml .Values.customLabels }}
45 | {{- end}}
46 | {{- end }}
47 |
48 | {{/*
49 | Selector labels
50 | */}}
51 | {{- define "kminion.selectorLabels" -}}
52 | app.kubernetes.io/name: {{ include "kminion.name" . }}
53 | app.kubernetes.io/instance: {{ .Release.Name }}
54 | {{- end }}
55 |
56 | {{/*
57 | Create the name of the service account to use
58 | */}}
59 | {{- define "kminion.serviceAccountName" -}}
60 | {{- if .Values.serviceAccount.create }}
61 | {{- default (include "kminion.fullname" .) .Values.serviceAccount.name }}
62 | {{- else }}
63 | {{- default "default" .Values.serviceAccount.name }}
64 | {{- end }}
65 | {{- end }}
66 |
67 | {{/*
68 | Return the appropriate apiVersion for ingress.
69 | */}}
70 | {{- define "kminion.ingress.apiVersion" -}}
71 | {{- if and ($.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19-0" .Capabilities.KubeVersion.Version) }}
72 | {{- print "networking.k8s.io/v1" }}
73 | {{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" }}
74 | {{- print "networking.k8s.io/v1beta1" }}
75 | {{- else }}
76 | {{- print "extensions/v1beta1" }}
77 | {{- end }}
78 | {{- end }}
79 | {{/*
80 | Return if ingress is stable.
81 | */}}
82 | {{- define "kminion.ingress.isStable" -}}
83 | {{- eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1" }}
84 | {{- end }}
85 | {{/*
86 | Return if ingress supports ingressClassName.
87 | */}}
88 | {{- define "kminion.ingress.supportsIngressClassName" -}}
89 | {{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }}
90 | {{- end }}
91 |
92 | {{/*
93 | Return if ingress supports pathType.
94 | */}}
95 | {{- define "kminion.ingress.supportsPathType" -}}
96 | {{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }}
97 | {{- end }}
98 |
99 | {{/*
100 | Return the appropriate apiVersion for podDisruptionBudget.
101 | */}}
102 | {{- define "kminion.podDisruptionBudget.apiVersion" -}}
103 | {{- if $.Capabilities.APIVersions.Has "policy/v1/PodDisruptionBudget" }}
104 | {{- print "policy/v1" }}
105 | {{- else }}
106 | {{- print "policy/v1beta1" }}
107 | {{- end }}
108 | {{- end }}
109 |
--------------------------------------------------------------------------------
/charts/kminion/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: {{include "kminion.fullname" .}}
5 | namespace: {{ .Release.Namespace | quote }}
6 | labels:
7 | {{- include "kminion.labels" . | nindent 4}}
8 | data:
9 | config.yaml: |
10 | {{- toYaml .Values.kminion.config | nindent 4}}
11 |
--------------------------------------------------------------------------------
/charts/kminion/templates/daemonset.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.daemonset.enabled }}
2 | apiVersion: apps/v1
3 | kind: DaemonSet
4 | metadata:
5 | name: {{include "kminion.fullname" .}}
6 | namespace: {{ .Release.Namespace | quote }}
7 | labels:
8 | {{- include "kminion.labels" . | nindent 4}}
9 | spec:
10 | updateStrategy:
11 | type: OnDelete
12 | selector:
13 | matchLabels:
14 | {{- include "kminion.selectorLabels" . | nindent 6}}
15 | template:
16 | metadata:
17 | {{- with .Values.podAnnotations}}
18 | annotations:
19 | {{- toYaml . | nindent 8}}
20 | {{- end}}
21 | labels:
22 | {{- include "kminion.selectorLabels" . | nindent 8}}
23 | {{- if .Values.customLabels}}
24 | {{toYaml .Values.customLabels | nindent 8}}
25 | {{- end}}
26 | spec:
27 | {{- with .Values.imagePullSecrets}}
28 | imagePullSecrets:
29 | {{- toYaml . | nindent 8}}
30 | {{- end}}
31 | securityContext:
32 | {{- toYaml .Values.podSecurityContext | nindent 8}}
33 | serviceAccountName: {{ .Values.serviceAccount.name }}
34 | volumes:
35 | - name: config
36 | configMap:
37 | name: {{include "kminion.fullname" .}}
38 | {{- range .Values.deployment.volumes.secrets}}
39 | - name: {{.secretName}}
40 | secret:
41 | secretName: {{.secretName}}
42 | {{- end}}
43 | containers:
44 | - name: {{.Chart.Name}}
45 | securityContext:
46 | {{- toYaml .Values.securityContext | nindent 12}}
47 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
48 | imagePullPolicy: {{.Values.image.pullPolicy}}
49 | ports:
50 | - name: metrics
51 | containerPort: {{.Values.service.port}}
52 | protocol: TCP
53 | env:
54 | - name: POD_NAME
55 | valueFrom:
56 | fieldRef:
57 | fieldPath: metadata.name
58 | - name: POD_NAMESPACE
59 | valueFrom:
60 | fieldRef:
61 | fieldPath: metadata.namespace
62 | - name: CONFIG_FILEPATH
63 | value: /etc/kminion/config.yaml
64 | {{- range .Values.deployment.env.values}}
65 | - name: {{.name}}
66 | value: {{.value | quote}}
67 | {{- end}}
68 | {{- range .Values.deployment.env.secretKeyRefs}}
69 | - name: {{.name}}
70 | valueFrom:
71 | secretKeyRef:
72 | name: {{.secretName}}
73 | key: {{.secretKey}}
74 | {{- end}}
75 | {{- range .Values.deployment.env.configMapKeyRefs}}
76 | - name: {{.name}}
77 | valueFrom:
78 | configMapKeyRef:
79 | name: {{.configMapName}}
80 | key: {{.configMapKey}}
81 | {{- end}}
82 | volumeMounts:
83 | - name: config
84 | mountPath: /etc/kminion
85 | {{- range .Values.deployment.volumes.secrets}}
86 | - name: {{.secretName}}
87 | mountPath: {{.mountPath}}
88 | {{- end}}
89 | resources:
90 | {{- toYaml .Values.resources | nindent 12}}
91 | livenessProbe:
92 | failureThreshold: 3
93 | httpGet:
94 | path: /ready
95 | port: metrics
96 | scheme: HTTP
97 | initialDelaySeconds: 10
98 | periodSeconds: 10
99 | successThreshold: 1
100 | timeoutSeconds: 1
101 | readinessProbe:
102 | failureThreshold: 3
103 | httpGet:
104 | path: /ready
105 | port: metrics
106 | scheme: HTTP
107 | periodSeconds: 10
108 | successThreshold: 1
109 | timeoutSeconds: 1
110 | {{- with .Values.affinity}}
111 | affinity:
112 | {{- toYaml . | nindent 8}}
113 | {{- end}}
114 | {{- with .Values.tolerations}}
115 | tolerations:
116 | {{- toYaml . | nindent 8}}
117 | {{- end}}
118 | {{- end }}
119 |
--------------------------------------------------------------------------------
/charts/kminion/templates/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: {{include "kminion.fullname" .}}
5 | namespace: {{ .Release.Namespace | quote }}
6 | labels:
7 | {{- include "kminion.labels" . | nindent 4}}
8 | {{- with .Values.deployment.labels}}
9 | {{- toYaml . | nindent 4}}
10 | {{- end}}
11 | {{- with .Values.deployment.annotations}}
12 | annotations:
13 | {{- toYaml . | nindent 4}}
14 | {{- end}}
15 | spec:
16 | {{- if not .Values.autoscaling.enabled}}
17 | replicas: {{.Values.replicaCount}}
18 | {{- end}}
19 | selector:
20 | matchLabels:
21 | {{- include "kminion.selectorLabels" . | nindent 6}}
22 | template:
23 | metadata:
24 | {{- with .Values.podAnnotations}}
25 | annotations:
26 | {{- toYaml . | nindent 8}}
27 | {{- end}}
28 | labels:
29 | {{- include "kminion.selectorLabels" . | nindent 8}}
30 | {{- if .Values.customLabels}}
31 | {{toYaml .Values.customLabels | nindent 8}}
32 | {{- end}}
33 | spec:
34 | {{- with .Values.imagePullSecrets}}
35 | imagePullSecrets:
36 | {{- toYaml . | nindent 8}}
37 | {{- end}}
38 | serviceAccountName: {{include "kminion.serviceAccountName" .}}
39 | securityContext:
40 | {{- toYaml .Values.podSecurityContext | nindent 8}}
41 | volumes:
42 | - name: config
43 | configMap:
44 | name: {{include "kminion.fullname" .}}
45 | {{- range .Values.deployment.volumes.secrets}}
46 | - name: {{.secretName}}
47 | secret:
48 | secretName: {{.secretName}}
49 | {{- end}}
50 | {{- with .Values.deployment.volumes.extra }}
51 | {{- toYaml . | nindent 8 }}
52 | {{- end }}
53 | initContainers:
54 | {{- with .Values.deployment.initContainers }}
55 | {{- toYaml . | nindent 8 }}
56 | {{- end }}
57 | containers:
58 | - name: {{.Chart.Name}}
59 | securityContext:
60 | {{- toYaml .Values.securityContext | nindent 12}}
61 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
62 | imagePullPolicy: {{.Values.image.pullPolicy}}
63 | ports:
64 | - name: metrics
65 | containerPort: {{.Values.service.port}}
66 | protocol: TCP
67 | env:
68 | - name: CONFIG_FILEPATH
69 | value: /etc/kminion/config.yaml
70 | {{- range .Values.deployment.env.values}}
71 | - name: {{.name}}
72 | value: {{.value | quote}}
73 | {{- end}}
74 | {{- range .Values.deployment.env.secretKeyRefs}}
75 | - name: {{.name}}
76 | valueFrom:
77 | secretKeyRef:
78 | name: {{.secretName}}
79 | key: {{.secretKey}}
80 | {{- end}}
81 | {{- range .Values.deployment.env.configMapKeyRefs}}
82 | - name: {{.name}}
83 | valueFrom:
84 | configMapKeyRef:
85 | name: {{.configMapName}}
86 | key: {{.configMapKey}}
87 | {{- end}}
88 | volumeMounts:
89 | - name: config
90 | mountPath: /etc/kminion
91 | {{- range .Values.deployment.volumes.secrets}}
92 | - name: {{.secretName}}
93 | mountPath: {{.mountPath}}
94 | {{- end}}
95 | resources:
96 | {{- toYaml .Values.resources | nindent 12}}
97 | {{- if .Values.deployment.readinessProbe.enabled }}
98 | readinessProbe:
99 | httpGet:
100 | path: /ready
101 | port: {{.Values.service.port}}
102 | initialDelaySeconds: 10
103 | {{- end }}
104 | {{- with .Values.deployment.extraContainers }}
105 | {{- toYaml . | nindent 8 }}
106 | {{- end }}
107 | {{- with .Values.nodeSelector}}
108 | nodeSelector:
109 | {{- toYaml . | nindent 8}}
110 | {{- end}}
111 | {{- with .Values.affinity}}
112 | affinity:
113 | {{- toYaml . | nindent 8}}
114 | {{- end}}
115 | {{- with .Values.tolerations}}
116 | tolerations:
117 | {{- toYaml . | nindent 8}}
118 | {{- end}}
119 |
--------------------------------------------------------------------------------
/charts/kminion/templates/hpa.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.autoscaling.enabled }}
2 | apiVersion: {{ ternary "autoscaling/v2" "autoscaling/v2beta1" (.Capabilities.APIVersions.Has "autoscaling/v2") }}
3 | kind: HorizontalPodAutoscaler
4 | metadata:
5 | name: {{ include "kminion.fullname" . }}
6 | namespace: {{ .Release.Namespace | quote }}
7 | labels:
8 | {{- include "kminion.labels" . | nindent 4 }}
9 | spec:
10 | scaleTargetRef:
11 | apiVersion: apps/v1
12 | kind: Deployment
13 | name: {{ include "kminion.fullname" . }}
14 | minReplicas: {{ .Values.autoscaling.minReplicas }}
15 | maxReplicas: {{ .Values.autoscaling.maxReplicas }}
16 | metrics:
17 | {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
18 | - type: Resource
19 | resource:
20 | name: cpu
21 | {{- if .Capabilities.APIVersions.Has "autoscaling/v2" }}
22 | target:
23 | type: Utilization
24 | averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
25 | {{ else }}
26 | targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
27 | {{- end }}
28 | {{- end }}
29 | {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
30 | - type: Resource
31 | resource:
32 | name: memory
33 | {{- if .Capabilities.APIVersions.Has "autoscaling/v2" }}
34 | target:
35 | type: Utilization
36 | averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
37 | {{ else }}
38 | targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
39 | {{- end }}
40 | {{- end }}
41 | {{- end }}
42 |
--------------------------------------------------------------------------------
/charts/kminion/templates/ingress.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.ingress.enabled -}}
2 | {{- $fullName := include "kminion.fullname" . -}}
3 | {{- $svcPort := .Values.service.port -}}
4 | {{- $ingressApiIsStable := eq (include "kminion.ingress.isStable" .) "true" -}}
5 | {{- $ingressSupportsIngressClassName := eq (include "kminion.ingress.supportsIngressClassName" .) "true" -}}
6 | {{- $ingressSupportsPathType := eq (include "kminion.ingress.supportsPathType" .) "true" -}}
7 | {{- $fullName := include "kminion.fullname" . -}}
8 | {{- $servicePort := .Values.service.port -}}
9 | {{- $ingressPath := .Values.ingress.path -}}
10 | {{- $ingressPathType := .Values.ingress.pathType -}}
11 | {{- $extraPaths := .Values.ingress.extraPaths -}}
12 |
13 | apiVersion: {{ include "kminion.ingress.apiVersion" . }}
14 | kind: Ingress
15 | metadata:
16 | name: {{ $fullName }}
17 | namespace: {{ .Release.Namespace | quote }}
18 | labels:
19 | {{- include "kminion.labels" . | nindent 4 }}
20 | {{- with .Values.ingress.annotations }}
21 | annotations:
22 | {{- toYaml . | nindent 4 }}
23 | {{- end }}
24 | spec:
25 | {{- if and $ingressSupportsIngressClassName .Values.ingress.ingressClassName }}
26 | ingressClassName: {{ .Values.ingress.ingressClassName }}
27 | {{- end -}}
28 | {{- with .Values.ingress.tls }}
29 | tls:
30 | {{- tpl (toYaml .) $ | nindent 4 }}
31 | {{- end }}
32 | rules:
33 | {{- if .Values.ingress.hosts }}
34 | {{- range .Values.ingress.hosts }}
35 | - host: {{ tpl . $ }}
36 | http:
37 | paths:
38 | {{- with $extraPaths }}
39 | {{- toYaml . | nindent 10 }}
40 | {{- end }}
41 | - path: {{ $ingressPath }}
42 | {{- if $ingressSupportsPathType }}
43 | pathType: {{ $ingressPathType }}
44 | {{- end }}
45 | backend:
46 | {{- if $ingressApiIsStable }}
47 | service:
48 | name: {{ $fullName }}
49 | port:
50 | number: {{ $servicePort }}
51 | {{- else }}
52 | serviceName: {{ $fullName }}
53 | servicePort: {{ $servicePort }}
54 | {{- end }}
55 | {{- end }}
56 | {{- else }}
57 | - http:
58 | paths:
59 | - backend:
60 | {{- if $ingressApiIsStable }}
61 | service:
62 | name: {{ $fullName }}
63 | port:
64 | number: {{ $servicePort }}
65 | {{- else }}
66 | serviceName: {{ $fullName }}
67 | servicePort: {{ $servicePort }}
68 | {{- end }}
69 | {{- with $ingressPath }}
70 | path: {{ . }}
71 | {{- end }}
72 | {{- if $ingressSupportsPathType }}
73 | pathType: {{ $ingressPathType }}
74 | {{- end }}
75 | {{- end -}}
76 | {{- end }}
77 |
--------------------------------------------------------------------------------
/charts/kminion/templates/poddisruptionbudget.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.podDisruptionBudget }}
2 | apiVersion: {{ include "kminion.podDisruptionBudget.apiVersion" . }}
3 | kind: PodDisruptionBudget
4 | metadata:
5 | name: {{ template "kminion.fullname" . }}
6 | namespace: {{ .Release.Namespace | quote }}
7 | labels:
8 | {{- include "kminion.labels" . | nindent 4}}
9 | spec:
10 | {{- if .Values.podDisruptionBudget.minAvailable }}
11 | minAvailable: {{ .Values.podDisruptionBudget.minAvailable }}
12 | {{- end }}
13 | {{- if .Values.podDisruptionBudget.maxUnavailable }}
14 | maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable }}
15 | {{- end }}
16 | selector:
17 | matchLabels:
18 | {{- include "kminion.selectorLabels" . | nindent 6}}
19 | {{- end }}
20 |
--------------------------------------------------------------------------------
/charts/kminion/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "kminion.fullname" . }}
5 | namespace: {{ .Release.Namespace | quote }}
6 | labels:
7 | {{- include "kminion.labels" . | nindent 4 }}
8 | {{- if .Values.service.annotations }}
9 | annotations:
10 | {{- toYaml .Values.service.annotations | nindent 4 }}
11 | {{- end }}
12 | spec:
13 | type: {{ .Values.service.type }}
14 | ports:
15 | - port: {{ .Values.service.port }}
16 | targetPort: metrics
17 | protocol: TCP
18 | name: metrics
19 | {{- if .Values.service.extraPorts }}
20 | {{- toYaml .Values.service.extraPorts | nindent 4 }}
21 | {{- end }}
22 | selector:
23 | {{- include "kminion.selectorLabels" . | nindent 4 }}
24 |
--------------------------------------------------------------------------------
/charts/kminion/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.serviceAccount.create -}}
2 | apiVersion: v1
3 | kind: ServiceAccount
4 | metadata:
5 | name: {{ include "kminion.serviceAccountName" . }}
6 | namespace: {{ .Release.Namespace | quote }}
7 | labels:
8 | {{- include "kminion.labels" . | nindent 4 }}
9 | {{- with .Values.serviceAccount.annotations }}
10 | annotations:
11 | {{- toYaml . | nindent 4 }}
12 | {{- end }}
13 | {{- end }}
14 |
--------------------------------------------------------------------------------
/charts/kminion/templates/servicemonitor.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.serviceMonitor.create }}
2 | apiVersion: monitoring.coreos.com/v1
3 | kind: ServiceMonitor
4 | metadata:
5 | name: {{include "kminion.fullname" .}}
6 | namespace: {{ .Release.Namespace | quote }}
7 | labels:
8 | {{- include "kminion.labels" . | nindent 4}}
9 | {{- if .Values.serviceMonitor.additionalLabels}}
10 | {{toYaml .Values.serviceMonitor.additionalLabels | nindent 4}}
11 | {{- end}}
12 | spec:
13 | selector:
14 | matchLabels:
15 | {{- include "kminion.labels" . | nindent 6}}
16 | endpoints:
17 | - port: metrics
18 | path: /metrics
19 | honorLabels: {{ .Values.serviceMonitor.honorLabels }}
20 | scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }}
21 | interval: {{ .Values.serviceMonitor.interval }}
22 | {{- if .Values.serviceMonitor.relabelings }}
23 | relabelings:
24 | {{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }}
25 | {{- end }}
26 | {{- if .Values.serviceMonitor.targetLabels}}
27 | targetLabels:
28 | {{- toYaml .Values.serviceMonitor.targetLabels | nindent 4}}
29 | {{- end}}
30 | {{- if .Values.customLabels }}
31 | podTargetLabels:
32 | {{- (keys .Values.customLabels | sortAlpha) | toYaml | nindent 4 }}
33 | {{- end}}
34 | {{- end }}
35 |
--------------------------------------------------------------------------------
/config.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "strings"
7 |
8 | "github.com/cloudhut/kminion/v2/kafka"
9 | "github.com/cloudhut/kminion/v2/logging"
10 | "github.com/cloudhut/kminion/v2/minion"
11 | "github.com/cloudhut/kminion/v2/prometheus"
12 | "github.com/knadh/koanf"
13 | "github.com/knadh/koanf/parsers/yaml"
14 | "github.com/knadh/koanf/providers/env"
15 | "github.com/knadh/koanf/providers/file"
16 | "github.com/mitchellh/mapstructure"
17 | "go.uber.org/zap"
18 | )
19 |
20 | type Config struct {
21 | Kafka kafka.Config `koanf:"kafka"`
22 | Minion minion.Config `koanf:"minion"`
23 | Exporter prometheus.Config `koanf:"exporter"`
24 | Logger logging.Config `koanf:"logger"`
25 | }
26 |
27 | func (c *Config) SetDefaults() {
28 | c.Kafka.SetDefaults()
29 | c.Minion.SetDefaults()
30 | c.Exporter.SetDefaults()
31 | c.Logger.SetDefaults()
32 | }
33 |
34 | func (c *Config) Validate() error {
35 | err := c.Kafka.Validate()
36 | if err != nil {
37 | return fmt.Errorf("failed to validate kafka config: %w", err)
38 | }
39 |
40 | err = c.Minion.Validate()
41 | if err != nil {
42 | return fmt.Errorf("failed to validate minion config: %w", err)
43 | }
44 |
45 | err = c.Logger.Validate()
46 | if err != nil {
47 | return fmt.Errorf("failed to validate logger config: %w", err)
48 | }
49 |
50 | return nil
51 | }
52 |
53 | func newConfig(logger *zap.Logger) (Config, error) {
54 | k := koanf.New(".")
55 | var cfg Config
56 | cfg.SetDefaults()
57 |
58 | // 1. Check if a config filepath is set via the CONFIG_FILEPATH environment variable. If so, try to load the file using a YAML parser.
59 | envKey := "CONFIG_FILEPATH"
60 | configFilepath := os.Getenv(envKey)
61 | if configFilepath == "" {
62 | logger.Info("the env variable '" + envKey + "' is not set, therefore no YAML config will be loaded")
63 | } else {
64 | err := k.Load(file.Provider(configFilepath), yaml.Parser())
65 | if err != nil {
66 | return Config{}, fmt.Errorf("failed to parse YAML config: %w", err)
67 | }
68 | }
69 |
70 | // We could unmarshal the loaded koanf input after loading both providers, however we want to unmarshal the YAML
71 | // config with `ErrorUnused` set to true, but unmarshal environment variables with `ErrorUnused` set to false (default).
72 | // Rationale: Orchestrators like Kubernetes inject unrelated environment variables, which we still want to allow.
73 | err := k.UnmarshalWithConf("", &cfg, koanf.UnmarshalConf{
74 | Tag: "",
75 | FlatPaths: false,
76 | DecoderConfig: &mapstructure.DecoderConfig{
77 | DecodeHook: mapstructure.ComposeDecodeHookFunc(
78 | mapstructure.StringToTimeDurationHookFunc()),
79 | Metadata: nil,
80 | Result: &cfg,
81 | WeaklyTypedInput: true,
82 | ErrorUnused: true,
83 | },
84 | })
85 | if err != nil {
86 | return Config{}, err
87 | }
88 |
89 | err = k.Load(env.ProviderWithValue("", ".", func(s string, v string) (string, interface{}) {
90 | 		// Lowercase the env variable name and replace underscores with dots so it matches koanf's key format.
91 | 		key := strings.Replace(strings.ToLower(s), "_", ".", -1)
93 | // If there is a comma in the value, split the value into a slice by the comma.
94 | if strings.Contains(v, ",") {
95 | return key, strings.Split(v, ",")
96 | }
97 |
98 | // Otherwise return the new key with the unaltered value
99 | return key, v
100 | }), nil)
101 | if err != nil {
102 | return Config{}, err
103 | }
104 |
105 | err = k.Unmarshal("", &cfg)
106 | if err != nil {
107 | return Config{}, err
108 | }
109 |
110 | err = cfg.Validate()
111 | if err != nil {
112 | return Config{}, fmt.Errorf("failed to validate config: %w", err)
113 | }
114 |
115 | return cfg, nil
116 | }
117 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: '2.1'
3 |
4 | services:
5 |
6 | zookeeper:
7 | image: confluentinc/cp-zookeeper:latest
8 | ports:
9 | - 2181:2181
10 | environment:
11 | ZOOKEEPER_CLIENT_PORT: 2181
12 | ZOOKEEPER_TICK_TIME: 2000
13 | container_name: zookeeper
14 | hostname: zookeeper
15 |
16 | kafka:
17 | image: confluentinc/cp-kafka:latest
18 | hostname: kafka
19 | container_name: kafka
20 | depends_on:
21 | - zookeeper
22 | ports:
23 | - 9092:9092
24 | environment:
25 | KAFKA_BROKER_ID: 1
26 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
28 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
29 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
30 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
31 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
32 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
33 |
34 | kafka-minion:
35 | build:
36 | context: .
37 | dockerfile: ./Dockerfile
38 | hostname: kafka-minion
39 | container_name: kafka-minion
40 | depends_on:
41 | - zookeeper
42 | - kafka
43 | ports:
44 | - 8080:8080
45 | environment:
46 | KAFKA_BROKERS: kafka:29092
47 | restart: unless-stopped
--------------------------------------------------------------------------------
/docs/end-to-end.md:
--------------------------------------------------------------------------------
1 | # End-To-End Monitoring
2 |
3 | This page describes the end-to-end monitoring feature in KMinion, how it works, and what metrics it provides.
4 |
5 | ## Motivation
6 |
7 | > What is the issue? Why did we build this feature?
8 |
9 | We can monitor metrics like CPU usage, free disk space, or even consumer group lag. However, these metrics don't give us
10 | a good idea of the performance characteristics that an actual, real-world client experiences when connected to the cluster.
11 |
12 | With these "classic" metrics, many questions go unanswered:
13 |
14 | - Can a client produce messages to the cluster?
15 | - Can clients produce & consume messages as well as commit group offsets with an acceptable latency?
16 | - Is the cluster in a healthy state from a client's perspective?
17 |
18 | ## Approach & Implementation
19 |
20 | > How do we solve those issues? How does the feature work?
21 |
22 | The most reliable way to get real-world performance and availability metrics is to actually run a producer/consumer
23 | ourselves. This is exactly what the end-to-end monitoring feature does!
24 |
25 | ## High Level Overview
26 |
27 | In order to determine whether the cluster is fully operational and its performance is within acceptable limits, KMinion
28 | continuously produces and consumes messages to/from the cluster. That way we can measure things like ack-latency,
29 | commit-latency, and roundtrip-time.
30 |
31 | KMinion creates and manages its own topic for the end-to-end test messages. The name of the topic can be configured.
32 |
33 | **The first step** is to create a message and send it to the cluster.
34 |
35 | - Every produced message is added to an internal tracker, so we can recognize messages being "lost". A message is
36 | considered lost if it doesn't arrive back at the consumer within the configured time span.
37 |
38 | **The second step** is to continuously consume the topic.
39 |
40 | - As each message arrives, we calculate its roundtrip time (the time from when the message was created until KMinion
41 |   received it again), as shown in the sketch below.
42 | - Consumer group offsets are committed periodically, while also recording the time each commit takes.
43 |
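To make these two steps more concrete, here is a minimal, self-contained Go sketch of the roundtrip calculation. It is illustrative only and not KMinion's actual implementation (which lives in the `e2e` package and tracks in-flight messages in a TTL cache); the JSON field names match the end-to-end message format, everything else is made up for the example.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// probeMessage mirrors the idea of KMinion's end-to-end test message: it carries
// the producing instance's ID and a creation timestamp so that the consumer can
// compute the roundtrip time when the message arrives back.
type probeMessage struct {
	MinionID  string `json:"minionID"`
	MessageID string `json:"messageID"`
	CreatedNs int64  `json:"createdUtcNs"`
}

func main() {
	// Producer side: embed the creation time and produce the JSON payload.
	payload, err := json.Marshal(probeMessage{
		MinionID:  "example-instance",
		MessageID: "msg-1",
		CreatedNs: time.Now().UnixNano(),
	})
	if err != nil {
		panic(err)
	}

	// ... the payload would travel through Kafka here ...

	// Consumer side: decode the payload and compute the roundtrip latency.
	var received probeMessage
	if err := json.Unmarshal(payload, &received); err != nil {
		panic(err)
	}
	roundtrip := time.Since(time.Unix(0, received.CreatedNs))
	fmt.Printf("roundtrip latency: %v\n", roundtrip)
}
```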
44 | ### Topic Management
45 |
46 | The topic KMinion uses is created and managed completely automatically (though the topic name can be configured).
47 |
48 | KMinion continuously checks the topic and fixes issues/imbalances automatically:
49 |
50 | - Adds partitions to the topic, so that it has at least as many partitions as there are brokers.
51 | - Reassigns partitions to ensure that every broker leads at least one partition, and that all partitions' replicas are
52 |   distributed evenly across the brokers. KMinion prefers the broker whose ID matches the partition ID (see the sketch below).
53 |
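The following sketch illustrates the reconciliation targets described above under simple assumptions (it is not the actual code in `e2e/topic.go`): the desired partition count follows from the broker count and `partitionsPerBroker`, and each partition's preferred leader is the broker whose ID matches the partition ID.

```go
package main

import "fmt"

// desiredPartitionCount returns how many partitions the end-to-end topic should
// have so that, with the configured partitionsPerBroker, every broker can lead
// at least one partition.
func desiredPartitionCount(brokerCount, partitionsPerBroker int) int {
	return brokerCount * partitionsPerBroker
}

// preferredLeader illustrates the "partition ID matches broker ID" idea:
// partition p is preferably led by the broker at position p modulo the broker count.
func preferredLeader(partitionID int, brokerIDs []int32) int32 {
	return brokerIDs[partitionID%len(brokerIDs)]
}

func main() {
	brokerIDs := []int32{0, 1, 2}
	partitions := desiredPartitionCount(len(brokerIDs), 1)
	for p := 0; p < partitions; p++ {
		fmt.Printf("partition %d -> preferred leader broker %d\n", p, preferredLeader(p, brokerIDs))
	}
}
```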
54 | ### Consumer Group Management
55 |
56 | On startup, each KMinion instance generates a unique identifier (UUID) that is used to create its own consumer group,
57 | which incorporates the shared prefix from the config.
58 |
59 | That is necessary because:
60 |
61 | - Offsets must not be shared among multiple instances.
62 | - Each instance must always consume **all** partitions of the topic.
63 |
64 | The instance's UUID is also embedded in every message, so each instance can easily filter out messages it didn't
65 | produce. That's why it is perfectly fine to run multiple KMinion instances against the same cluster, using the same
66 | topic.
67 |
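As a rough sketch (illustrative only; the exact group ID format is an assumption and not taken from the source), the per-instance consumer group and the message filtering could look like this:

```go
package main

import (
	"fmt"

	"github.com/google/uuid"
)

func main() {
	// Each instance derives its own consumer group from the shared prefix
	// (config option consumer.groupIdPrefix) plus a per-instance UUID.
	const groupIdPrefix = "kminion-end-to-end"
	minionID := uuid.NewString()
	groupID := fmt.Sprintf("%s-%s", groupIdPrefix, minionID)
	fmt.Println("consumer group:", groupID)

	// On the consumer side, messages produced by other instances are skipped
	// by comparing the minion ID embedded in the message with our own.
	incomingMinionID := "some-other-instance"
	if incomingMinionID != minionID {
		fmt.Println("ignoring message produced by another KMinion instance")
	}
}
```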
68 | KMinion also monitors and deletes consumer groups that use its configured prefix. That way, when an instance
69 | exits/restarts, previous consumer groups will be cleaned up quickly (groups that have been empty for more than 20s are deleted; the check runs every few seconds).
70 |
71 | ## Available Metrics
72 |
73 | The end-to-end monitoring feature exports the following metrics.
74 |
75 | ### Counters
76 |
77 | | Name | Description |
78 | | --- | --- |
79 | | `kminion_end_to_end_messages_produced_total` | Messages KMinion *tried* to send |
80 | | `kminion_end_to_end_messages_received_total` | Number of messages received (only counts those that match, i.e. that this instance actually produced itself) |
81 | | `kminion_end_to_end_offset_commits_total` | Counts how many times KMinion's end-to-end test has committed offsets |
82 | | `kminion_end_to_end_messages_lost_total` | Number of messages that have been produced successfully but not received within the configured SLA duration |
83 | | `kminion_end_to_end_messages_produced_failed_total` | Number of messages that failed to be produced to Kafka because of a timeout or failure |
85 |
86 | ### Histograms
87 |
88 | | Name | Description |
89 | | --- | --- |
90 | | `kminion_end_to_end_produce_latency_seconds` | Duration until the cluster acknowledged a message |
91 | | `kminion_end_to_end_offset_commit_latency_seconds` | Time Kafka took to respond to KMinion's offset commit |
92 | | `kminion_end_to_end_roundtrip_latency_seconds` | Duration from creation of a message until it was received/consumed again |
93 |
94 | ### Gauges
95 | | Name | Description |
96 | | --- | --- |
97 | | `kminion_end_to_end_messages_produced_in_flight` | Number of messages that KMinion's end-to-end test produced but has not received an answer for yet |
98 |
99 | ## Config Properties
100 |
101 | All config properties related to this feature are located in `minion.endToEnd`.
102 |
103 | ```yaml
104 | endToEnd:
105 | enabled: true
106 | probeInterval: 800ms # how often to send end-to-end test messages
107 | topicManagement:
108 | # You can disable topic management, without disabling the testing feature.
109 | # Only makes sense if you have multiple kminion instances, and for some reason only want one of them to create/configure the topic.
110 | # It is strongly recommended to leave this enabled.
111 | enabled: true
112 |
113 | # Name of the topic kminion uses to send its test messages
114 | # You do *not* need to change this if you are running multiple kminion instances on the same cluster.
115 | # Different instances are perfectly fine with sharing the same topic!
116 | name: kminion-end-to-end
117 |
118 | # How often kminion checks its topic to validate configuration, partition count, and partition assignments
119 | reconciliationInterval: 10m
120 |
121 | # Useful for monitoring the performance of acks (if >1 this is best combined with 'producer.requiredAcks' set to 'all')
122 | replicationFactor: 1
123 |
124 | # Rarely makes sense to change this, but maybe if you want some sort of cheap load test?
125 | partitionsPerBroker: 1
126 |
127 | producer:
128 | # This defines the maximum time to wait for an ack response after producing a message,
129 | # and the upper bound for histogram buckets in "produce_latency_seconds"
130 | ackSla: 5s
131 |     # Can be set to "all" (default) so Kafka only reports an end-to-end test message as acknowledged if
132 |     # the message was written to all in-sync replicas of the partition.
133 |     # Or it can be set to "leader" to only require the partition leader to have written the message to its log.
134 | requiredAcks: all
135 |
136 | consumer:
137 | # Prefix kminion uses when creating its consumer groups. Current kminion instance id will be appended automatically
138 | groupIdPrefix: kminion-end-to-end
139 |
140 | # Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want
141 |     # KMinion to clean up its old consumer groups. It should only be used if you use a unique prefix for KMinion.
142 | deleteStaleConsumerGroups: false
143 |
144 | # Defines the time limit beyond which a message is considered "lost" (failed the roundtrip),
145 | # also used as the upper bound for histogram buckets in "roundtrip_latency"
146 | roundtripSla: 20s
147 |
148 | # Maximum time an offset commit is allowed to take before considering it failed,
149 | # also used as the upper bound for histogram buckets in "commit_latency_seconds"
150 | commitSla: 10s
151 | ```
152 |
153 |
--------------------------------------------------------------------------------
/docs/metrics.md:
--------------------------------------------------------------------------------
1 | # Exported Metrics
2 |
3 | This document lists all exported metrics, shown with example values.
4 |
5 | ## Exporter Metrics
6 |
7 | ```
8 | # HELP kminion_exporter_up Build info about this Prometheus Exporter. Gauge value is 0 if one or more scrapes have failed.
9 | # TYPE kminion_exporter_up gauge
10 | kminion_exporter_up{version="sha-0ab0dcdf862f7a34b06998cd2d980148e048151a"} 1
11 |
12 | # HELP kminion_exporter_offset_consumer_records_consumed_total The number of offset records that have been consumed by the internal offset consumer
13 | # TYPE kminion_exporter_offset_consumer_records_consumed_total counter
14 | kminion_exporter_offset_consumer_records_consumed_total 5.058244883e+09
15 | ```
16 |
17 | ## Kafka Metrics
18 |
19 | ### General / Cluster Metrics
20 |
21 | ```
22 | # HELP kminion_kafka_broker_info Kafka broker information
23 | # TYPE kminion_kafka_broker_info gauge
24 | kminion_kafka_broker_info{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",is_controller="false",port="9092",rack_id="europe-west1-b"} 1
25 |
26 | # HELP kminion_kafka_cluster_info Kafka cluster information
27 | # TYPE kminion_kafka_cluster_info gauge
28 | kminion_kafka_cluster_info{broker_count="12",cluster_id="UYZJg8bhT_6SxhsdaQZEQ",cluster_version="v2.6",controller_id="6"} 1
29 | ```
30 |
31 | ### Log Dir Metrics
32 |
33 | ```
34 | # HELP kminion_kafka_broker_log_dir_size_total_bytes The summed size in bytes of all log dirs for a given broker
35 | # TYPE kminion_kafka_broker_log_dir_size_total_bytes gauge
36 | kminion_kafka_broker_log_dir_size_total_bytes{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",port="9092",rack_id="europe-west1-b"} 8.32654935115e+11
37 |
38 | # HELP kminion_kafka_topic_log_dir_size_total_bytes The summed size in bytes of partitions for a given topic. This includes the used space for replica partitions.
39 | # TYPE kminion_kafka_topic_log_dir_size_total_bytes gauge
40 | kminion_kafka_topic_log_dir_size_total_bytes{topic_name="__consumer_offsets"} 9.026554258e+09
41 | ```
42 |
43 | ### Topic & Partition Metrics
44 |
45 | ```
46 | # HELP kminion_kafka_topic_info Info labels for a given topic
47 | # TYPE kminion_kafka_topic_info gauge
48 | kminion_kafka_topic_info{cleanup_policy="compact",partition_count="1",replication_factor="1",topic_name="_confluent-ksql-default__command_topic"} 1
49 |
50 | # HELP kminion_kafka_topic_partition_low_water_mark Partition Low Water Mark
51 | # TYPE kminion_kafka_topic_partition_low_water_mark gauge
52 | kminion_kafka_topic_partition_low_water_mark{partition_id="0",topic_name="__consumer_offsets"} 0
53 |
54 | # HELP kminion_kafka_topic_low_water_mark_sum Sum of all the topic's partition low water marks
55 | # TYPE kminion_kafka_topic_low_water_mark_sum gauge
56 | kminion_kafka_topic_low_water_mark_sum{topic_name="__consumer_offsets"} 0
57 |
58 | # HELP kminion_kafka_topic_partition_high_water_mark Partition High Water Mark
59 | # TYPE kminion_kafka_topic_partition_high_water_mark gauge
60 | kminion_kafka_topic_partition_high_water_mark{partition_id="0",topic_name="__consumer_offsets"} 2.04952001e+08
61 |
62 | # HELP kminion_kafka_topic_high_water_mark_sum Sum of all the topic's partition high water marks
63 | # TYPE kminion_kafka_topic_high_water_mark_sum gauge
64 | kminion_kafka_topic_high_water_mark_sum{topic_name="__consumer_offsets"} 1.512023846873e+12
65 | ```
66 |
67 | ### Consumer Group Metrics
68 |
69 | ```
70 | # HELP kminion_kafka_consumer_group_info Consumer Group info metrics. It will report 1 if the group is in the stable state, otherwise 0.
71 | # TYPE kminion_kafka_consumer_group_info gauge
72 | kminion_kafka_consumer_group_info{coordinator_id="0",group_id="bigquery-sink",protocol="range",protocol_type="consumer",state="Stable"} 1
73 |
74 | # HELP kminion_kafka_consumer_group_members Consumer Group member count metrics. It will report the number of members in the consumer group
75 | # TYPE kminion_kafka_consumer_group_members gauge
76 | kminion_kafka_consumer_group_members{group_id="bigquery-sink"} 2
77 |
78 | # HELP kminion_kafka_consumer_group_empty_members Consumer Group Empty Members. It will report the number of members in the consumer group with no partition assigned
79 | # TYPE kminion_kafka_consumer_group_empty_members gauge
80 | kminion_kafka_consumer_group_empty_members{group_id="bigquery-sink"} 1
81 |
82 | # HELP kminion_kafka_consumer_group_topic_members Consumer Group topic member count metrics. It will report the number of members in the consumer group assigned on a given topic
83 | # TYPE kminion_kafka_consumer_group_topic_members gauge
84 | kminion_kafka_consumer_group_topic_members{group_id="bigquery-sink",topic_name="shop-activity"} 4
85 |
86 | # HELP kminion_kafka_consumer_group_topic_assigned_partitions Consumer Group topic partitions count metrics. It will report the number of partitions assigned in the consumer group for a given topic
87 | # TYPE kminion_kafka_consumer_group_topic_assigned_partitions gauge
88 | kminion_kafka_consumer_group_topic_assigned_partitions{group_id="bigquery-sink",topic_name="shop-activity"} 32
89 |
90 | # HELP kminion_kafka_consumer_group_topic_offset_sum The sum of all committed group offsets across all partitions in a topic
91 | # TYPE kminion_kafka_consumer_group_topic_offset_sum gauge
92 | kminion_kafka_consumer_group_topic_offset_sum{group_id="bigquery-sink",topic_name="shop-activity"} 4.259513e+06
93 |
94 | # HELP kminion_kafka_consumer_group_topic_partition_lag The number of messages a consumer group is lagging behind the latest offset of a partition
95 | # TYPE kminion_kafka_consumer_group_topic_partition_lag gauge
96 | kminion_kafka_consumer_group_topic_partition_lag{group_id="bigquery-sink",partition_id="10",topic_name="shop-activity"} 147481
97 |
98 | # HELP kminion_kafka_consumer_group_topic_lag The number of messages a consumer group is lagging behind across all partitions in a topic
99 | # TYPE kminion_kafka_consumer_group_topic_lag gauge
100 | kminion_kafka_consumer_group_topic_lag{group_id="bigquery-sink",topic_name="shop-activity"} 147481
101 |
102 | # HELP kminion_kafka_consumer_group_offset_commits_total The number of offsets committed by a group
103 | # TYPE kminion_kafka_consumer_group_offset_commits_total counter
104 | kminion_kafka_consumer_group_offset_commits_total{group_id="bigquery-sink"} 1098
105 | ```
106 |
107 | ### End-to-End Metrics
108 |
109 | ```
110 | # HELP kminion_end_to_end_messages_produced_total Number of messages that kminion's end-to-end test has tried to send to kafka
111 | # TYPE kminion_end_to_end_messages_produced_total counter
112 | kminion_end_to_end_messages_produced_total 384
113 |
114 | # HELP kminion_end_to_end_offset_commits_total Counts how many times kminions end-to-end test has committed messages
115 | # TYPE kminion_end_to_end_offset_commits_total counter
116 | kminion_end_to_end_offset_commits_total 18
117 |
118 | # HELP kminion_end_to_end_messages_received_total Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it it arrives within the configured roundtrip SLA (and it matches the minionID)
119 | # TYPE kminion_end_to_end_messages_received_total counter
120 | kminion_end_to_end_messages_received_total 383
121 |
122 | # HELP kminion_end_to_end_produce_latency_seconds Time until we received an ack for a produced message
123 | # TYPE kminion_end_to_end_produce_latency_seconds histogram
124 | kminion_end_to_end_produce_latency_seconds_bucket{partitionId="0",le="0.005"} 0
125 |
126 | # HELP kminion_end_to_end_offset_commit_latency_seconds Time kafka took to respond to kminion's offset commit
127 | # TYPE kminion_end_to_end_offset_commit_latency_seconds histogram
128 | kminion_end_to_end_offset_commit_latency_seconds_bucket{groupCoordinatorBrokerId="0",le="0.005"} 0
129 |
130 | # HELP kminion_end_to_end_roundtrip_latency_seconds Time it took between sending (producing) and receiving (consuming) a message
131 | # TYPE kminion_end_to_end_roundtrip_latency_seconds histogram
132 | kminion_end_to_end_roundtrip_latency_seconds_bucket{partitionId="0",le="0.005"} 0
133 |
134 | # HELP kminion_end_to_end_messages_lost_total Number of messages that have been produced successfully but not received within the configured SLA duration
135 | # TYPE kminion_end_to_end_messages_lost_total counter
136 | kminion_end_to_end_messages_lost_total{partition_id="0"} 0
137 |
138 | # HELP kminion_end_to_end_messages_produced_failed_total Number of messages failed to produce to Kafka because of a timeout or failure
139 | # TYPE kminion_end_to_end_messages_produced_failed_total counter
140 | kminion_end_to_end_messages_produced_failed_total{partition_id="0"} 0
141 |
142 | # HELP kminion_end_to_end_messages_produced_in_flight Number of messages that kminion's end-to-end test produced but has not received an answer for yet
143 | # TYPE kminion_end_to_end_messages_produced_in_flight gauge
144 | kminion_end_to_end_messages_produced_in_flight{partition_id="0"} 0
145 | ```
146 |
--------------------------------------------------------------------------------
/docs/reference-config.yaml:
--------------------------------------------------------------------------------
1 | #####################################################################################
2 | # This file documents all the available config options and their default values.
3 | #
4 | # All config options can be configured via environment variables as well.
5 | # If you specify both the env variable and yaml option for the same configuration
6 | # the environment variable will take precedence. If you want to use a YAML config
7 | # file, specify the path to the config file by setting the env variable
8 | # CONFIG_FILEPATH.
9 | #
10 | # The env variable name is auto-generated by uppercasing everything and adding
11 | # an underscore for each indentation/level. Some examples:
12 | # kafka.rackId => KAFKA_RACKID
13 | # kafka.tls.caFilepath => KAFKA_TLS_CAFILEPATH
14 | # minion.consumerGroups.allowedGroups => MINION_CONSUMERGROUPS_ALLOWEDGROUPS
15 | #
16 | # Env variables that expect array values can be provided by separating them using
17 | # a comma: KAFKA_BROKERS = "broker1:9092,broker2:9092,broker3:9092"
18 | #####################################################################################
19 |
20 | logger:
21 | # Valid values are: debug, info, warn, error, fatal, panic
22 | level: info
23 |
24 | kafka:
25 | brokers: [ ]
26 | clientId: "kminion"
27 | rackId: ""
28 | tls:
29 | enabled: false
30 | caFilepath: ""
31 | certFilepath: ""
32 | keyFilepath: ""
33 | # base64 encoded tls CA, cannot be set if 'caFilepath' is set
34 | ca: ""
35 | # base64 encoded tls cert, cannot be set if 'certFilepath' is set
36 | cert: ""
37 | # base64 encoded tls key, cannot be set if 'keyFilepath' is set
38 | key: ""
39 | passphrase: ""
40 | insecureSkipTlsVerify: false
41 |
42 | sasl:
43 |     # Whether or not SASL will be used for authentication
44 | enabled: false
45 | # Username to use for PLAIN or SCRAM mechanism
46 | username: ""
47 | # Password to use for PLAIN or SCRAM mechanism
48 | password: ""
49 | # Mechanism to use for SASL Authentication. Valid values are PLAIN, SCRAM-SHA-256, SCRAM-SHA-512, GSSAPI, OAUTHBEARER
50 | mechanism: "PLAIN"
51 | # GSSAPI / Kerberos config properties
52 | gssapi:
53 | authType: ""
54 | keyTabPath: ""
55 | kerberosConfigPath: ""
56 | serviceName: ""
57 | username: ""
58 | password: ""
59 | realm: ""
60 | enableFast: true
61 | # OAUTHBEARER config properties
62 | oauth:
63 | tokenEndpoint: ""
64 | clientId: ""
65 | clientSecret: ""
66 | scope: ""
67 |
68 | minion:
69 | consumerGroups:
70 | # Enabled specifies whether consumer groups shall be scraped and exported or not.
71 | enabled: true
72 | # Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal
73 | # __consumer_offsets topic. Both modes have their advantages and disadvantages.
74 | # * adminApi:
75 | # - Useful for managed kafka clusters that do not provide access to the offsets topic.
76 | # * offsetsTopic
77 | # - Enables kminion_kafka_consumer_group_offset_commits_total metrics.
78 | # - Processing the offsetsTopic requires slightly more memory and cpu than using the adminApi. The amount depends on the
79 | # size and throughput of the offsets topic.
80 | scrapeMode: adminApi # Valid values: adminApi, offsetsTopic
81 | # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
82 | # you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed
83 | # and only topic lags will be exported.
84 | granularity: partition
85 | # AllowedGroups are regex strings of group ids that shall be exported
86 | # You can specify allowed groups by providing literals like "my-consumergroup-name" or by providing regex expressions
87 | # like "/internal-.*/".
88 | allowedGroups: [ ".*" ]
89 | # IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups
90 | # take precedence over allowed groups.
91 | ignoredGroups: [ ]
92 | topics:
93 | # Enabled can be set to false in order to disable collecting any topic metrics.
94 | enabled: true
95 | # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
96 | # you aren't interested in per partition metrics you could choose "topic".
97 | granularity: partition
98 |     # AllowedTopics are regex strings of topic names whose metrics shall be exported.
99 | # You can specify allowed topics by providing literals like "my-topic-name" or by providing regex expressions
100 | # like "/internal-.*/".
101 | allowedTopics: [ ".*" ]
102 | # IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics
103 | # take precedence over allowed topics.
104 | ignoredTopics: [ ]
105 | # infoMetric is a configuration object for the kminion_kafka_topic_info metric
106 | infoMetric:
107 |       # ConfigKeys is a set of topic config keys that you want to have exported as part of the metric
108 | configKeys: [ "cleanup.policy" ]
109 | logDirs:
110 | # Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior
111 | # to version 1.0.0 as describing log dirs was not supported back then.
112 | enabled: true
113 |
114 | # EndToEnd Metrics
115 | # When enabled, kminion creates a topic which it produces to and consumes from, to measure various advanced metrics. See docs for more info
116 | endToEnd:
117 | enabled: false
118 | # How often to send end-to-end test messages
119 | probeInterval: 100ms
120 | topicManagement:
121 | # You can disable topic management, without disabling the testing feature.
122 | # Only makes sense if you have multiple kminion instances, and for some reason only want one of them to create/configure the topic
123 | enabled: true
124 |
125 | # Name of the topic kminion uses to send its test messages
126 | # You do *not* need to change this if you are running multiple kminion instances on the same cluster.
127 | # Different instances are perfectly fine with sharing the same topic!
128 | name: kminion-end-to-end
129 |
130 | # How often kminion checks its topic to validate configuration, partition count, and partition assignments
131 | reconciliationInterval: 10m
132 |
133 | # Depending on the desired monitoring (e.g. you want to alert on broker failure vs. cluster that is not writable)
134 | # you may choose replication factor 1 or 3 most commonly.
135 | replicationFactor: 1
136 |
137 | # Rarely makes sense to change this, but maybe if you want some sort of cheap load test?
138 | # By default (1) every broker gets one partition
139 | partitionsPerBroker: 1
140 |
141 | producer:
142 | # This defines:
143 | # - Maximum time to wait for an ack response after producing a message
144 | # - Upper bound for histogram buckets in "produce_latency_seconds"
145 | ackSla: 5s
146 |       # Can be set to "all" (default) so Kafka only reports an end-to-end test message as acknowledged if
147 |       # the message was written to all in-sync replicas of the partition.
148 |       # Or it can be set to "leader" to only require the partition leader to have written the message to its log.
149 | requiredAcks: all
150 |
151 | consumer:
152 | # Prefix kminion uses when creating its consumer groups. Current kminion instance id will be appended automatically
153 | groupIdPrefix: kminion-end-to-end
154 |
155 | # Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want
156 |       # KMinion to clean up its old consumer groups. It should only be used if you use a unique prefix for KMinion.
157 | deleteStaleConsumerGroups: false
158 |
159 | # This defines:
160 | # - Upper bound for histogram buckets in "roundtrip_latency"
161 | # - Time limit beyond which a message is considered "lost" (failed the roundtrip)
162 | roundtripSla: 20s
163 |
164 | # - Upper bound for histogram buckets in "commit_latency_seconds"
165 | # - Maximum time an offset commit is allowed to take before considering it failed
166 | commitSla: 10s
167 |
168 | exporter:
169 | # Namespace is the prefix for all exported Prometheus metrics
170 | namespace: "kminion"
171 | # Host that shall be used to bind the HTTP server on
172 | host: ""
173 | # Port that shall be used to bind the HTTP server on
174 | port: 8080
175 |
--------------------------------------------------------------------------------
/docs/screenshots/kminion-cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/docs/screenshots/kminion-cluster.png
--------------------------------------------------------------------------------
/docs/screenshots/kminion-groups.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/docs/screenshots/kminion-groups.png
--------------------------------------------------------------------------------
/docs/screenshots/kminion-topics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/docs/screenshots/kminion-topics.png
--------------------------------------------------------------------------------
/e2e/client_hooks.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "net"
5 | "sync/atomic"
6 | "time"
7 |
8 | "github.com/twmb/franz-go/pkg/kgo"
9 | "github.com/twmb/franz-go/pkg/kmsg"
10 | "go.uber.org/zap"
11 | )
12 |
13 | // in e2e we only use client hooks for logging connect/disconnect messages
14 | type clientHooks struct {
15 | logger *zap.Logger
16 |
17 | lastCoordinatorUpdate time.Time
18 | currentCoordinator *atomic.Value // kgo.BrokerMetadata
19 | }
20 |
21 | func newEndToEndClientHooks(logger *zap.Logger) *clientHooks {
22 | return &clientHooks{
23 | logger: logger.Named("e2e_hooks"),
24 | currentCoordinator: &atomic.Value{},
25 | }
26 | }
27 |
28 | func (c *clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur time.Duration, _ net.Conn, err error) {
29 | if err != nil {
30 | c.logger.Error("kafka connection failed", zap.String("broker_host", meta.Host), zap.Int32("broker_id", meta.NodeID), zap.Error(err))
31 | return
32 | }
33 | c.logger.Debug("kafka connection succeeded",
34 | zap.String("host", meta.Host), zap.Int32("broker_id", meta.NodeID),
35 | zap.Int64("dial_duration_ms", dialDur.Milliseconds()))
36 | }
37 |
38 | func (c *clientHooks) OnDisconnect(meta kgo.BrokerMetadata, _ net.Conn) {
39 | c.logger.Warn("kafka broker disconnected", zap.Int32("broker_id", meta.NodeID),
40 | zap.String("host", meta.Host))
41 | }
42 |
43 | // OnBrokerWrite is passed the broker metadata, the key for the request that
44 | // was written, the number of bytes written, how long the request
45 | // waited before being written, how long it took to write the request,
46 | // and any error.
47 | //
48 | // The bytes written does not count any tls overhead.
49 | // OnWrite is called after a write to a broker.
50 | //
51 | // OnWrite(meta BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error)
52 | func (c *clientHooks) OnBrokerWrite(meta kgo.BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error) {
53 | keyName := kmsg.NameForKey(key)
54 | if keyName != "OffsetCommit" {
55 | return
56 | }
57 |
58 | // c.logger.Info("hooks onWrite",
59 | // zap.Duration("timeToWrite", timeToWrite),
60 | // zap.NamedError("err", err))
61 | }
62 |
63 | // OnBrokerRead is passed the broker metadata, the key for the response that
64 | // was read, the number of bytes read, how long the Client waited
65 | // before reading the response, how long it took to read the response,
66 | // and any error.
67 | //
68 | // The bytes read does not count any TLS overhead.
69 | // OnRead is called after a read from a broker.
70 | // OnRead(meta BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error)
71 | func (c *clientHooks) OnBrokerRead(meta kgo.BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error) {
72 | consumerGroupMsgKeys := []int16{
73 | (&kmsg.OffsetCommitResponse{}).Key(),
74 | (&kmsg.JoinGroupResponse{}).Key(),
75 | (&kmsg.HeartbeatResponse{}).Key(),
76 | (&kmsg.SyncGroupResponse{}).Key(),
77 | }
78 |
79 | isMessageFromGroupCoordinator := isInArray(key, consumerGroupMsgKeys)
80 | if !isMessageFromGroupCoordinator {
81 | return
82 | }
83 |
84 | if err == nil {
85 | c.currentCoordinator.Store(meta)
86 | c.lastCoordinatorUpdate = time.Now()
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/e2e/config.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "fmt"
5 | "time"
6 | )
7 |
8 | type Config struct {
9 | Enabled bool `koanf:"enabled"`
10 | TopicManagement EndToEndTopicConfig `koanf:"topicManagement"`
11 | ProbeInterval time.Duration `koanf:"probeInterval"`
12 | Producer EndToEndProducerConfig `koanf:"producer"`
13 | Consumer EndToEndConsumerConfig `koanf:"consumer"`
14 | }
15 |
16 | func (c *Config) SetDefaults() {
17 | c.Enabled = false
18 | c.ProbeInterval = 100 * time.Millisecond
19 | c.TopicManagement.SetDefaults()
20 | c.Producer.SetDefaults()
21 | c.Consumer.SetDefaults()
22 | }
23 |
24 | func (c *Config) Validate() error {
25 |
26 | if !c.Enabled {
27 | return nil
28 | }
29 |
30 | 	// If the duration is 0s, 0ms, or any other variation of zero, it will be parsed as 0
31 | if c.ProbeInterval == 0 {
32 | return fmt.Errorf("failed to validate probeInterval config, the duration can't be zero")
33 | }
34 |
35 | err := c.TopicManagement.Validate()
36 | if err != nil {
37 | return fmt.Errorf("failed to validate topicManagement config: %w", err)
38 | }
39 |
40 | _, err = time.ParseDuration(c.ProbeInterval.String())
41 | if err != nil {
42 | return fmt.Errorf("failed to parse '%s' to time.Duration: %v", c.ProbeInterval.String(), err)
43 | }
44 |
45 | err = c.Producer.Validate()
46 | if err != nil {
47 | return fmt.Errorf("failed to validate producer config: %w", err)
48 | }
49 |
50 | err = c.Consumer.Validate()
51 | if err != nil {
52 | return fmt.Errorf("failed to validate consumer config: %w", err)
53 | }
54 |
55 | return nil
56 | }
57 |
--------------------------------------------------------------------------------
/e2e/config_consumer.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "fmt"
5 | "time"
6 | )
7 |
8 | type EndToEndConsumerConfig struct {
9 | GroupIdPrefix string `koanf:"groupIdPrefix"`
10 | DeleteStaleConsumerGroups bool `koanf:"deleteStaleConsumerGroups"`
11 |
12 | // RoundtripSLA is the time duration from the moment where we try to produce until the moment where we consumed
13 | // the message. Therefore this should always be higher than the produceTimeout / SLA.
14 | RoundtripSla time.Duration `koanf:"roundtripSla"`
15 | CommitSla time.Duration `koanf:"commitSla"`
16 | }
17 |
18 | func (c *EndToEndConsumerConfig) SetDefaults() {
19 | c.GroupIdPrefix = "kminion-end-to-end"
20 | c.DeleteStaleConsumerGroups = false
21 | c.RoundtripSla = 20 * time.Second
22 | c.CommitSla = 5 * time.Second
23 | }
24 |
25 | func (c *EndToEndConsumerConfig) Validate() error {
26 | if len(c.GroupIdPrefix) < 3 {
27 | return fmt.Errorf("kminion prefix should be at least 3 characters long")
28 | }
29 |
30 | if c.RoundtripSla <= 0 {
31 | return fmt.Errorf("consumer.roundtripSla must be greater than zero")
32 | }
33 |
34 | if c.CommitSla <= 0 {
35 | return fmt.Errorf("consumer.commitSla must be greater than zero")
36 | }
37 |
38 | return nil
39 | }
40 |
--------------------------------------------------------------------------------
/e2e/config_producer.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "fmt"
5 | "time"
6 | )
7 |
8 | type EndToEndProducerConfig struct {
9 | AckSla time.Duration `koanf:"ackSla"`
10 | RequiredAcks string `koanf:"requiredAcks"`
11 | }
12 |
13 | func (c *EndToEndProducerConfig) SetDefaults() {
14 | c.AckSla = 5 * time.Second
15 | c.RequiredAcks = "all"
16 | }
17 |
18 | func (c *EndToEndProducerConfig) Validate() error {
19 |
20 | if c.RequiredAcks != "all" && c.RequiredAcks != "leader" {
21 | 		return fmt.Errorf("producer.requiredAcks must be 'all' or 'leader'")
22 | }
23 |
24 | if c.AckSla <= 0 {
25 | return fmt.Errorf("producer.ackSla must be greater than zero")
26 | }
27 |
28 | return nil
29 | }
30 |
--------------------------------------------------------------------------------
/e2e/config_topic.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "fmt"
5 | "time"
6 | )
7 |
8 | type EndToEndTopicConfig struct {
9 | Enabled bool `koanf:"enabled"`
10 | Name string `koanf:"name"`
11 | ReplicationFactor int `koanf:"replicationFactor"`
12 | PartitionsPerBroker int `koanf:"partitionsPerBroker"`
13 | ReconciliationInterval time.Duration `koanf:"reconciliationInterval"`
14 | }
15 |
16 | func (c *EndToEndTopicConfig) SetDefaults() {
17 | c.Enabled = true
18 | c.Name = "kminion-end-to-end"
19 | c.ReplicationFactor = 1
20 | c.PartitionsPerBroker = 1
21 | c.ReconciliationInterval = 10 * time.Minute
22 | }
23 |
24 | func (c *EndToEndTopicConfig) Validate() error {
25 |
26 | if c.ReplicationFactor < 1 {
27 | 		return fmt.Errorf("failed to validate replicationFactor, it must be at least 1, got %v", c.ReplicationFactor)
28 | }
29 |
30 | if c.PartitionsPerBroker < 1 {
31 | 		return fmt.Errorf("failed to validate partitionsPerBroker, it must be at least 1, got %v", c.PartitionsPerBroker)
32 | }
33 |
34 | 	// If the duration is 0s, 0ms, or any other variation of zero, it will be parsed as 0
35 | if c.ReconciliationInterval == 0 {
36 | return fmt.Errorf("failed to validate topic.ReconciliationInterval config, the duration can't be zero")
37 | }
38 |
39 | return nil
40 | }
41 |
--------------------------------------------------------------------------------
/e2e/consumer.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "strconv"
7 | "time"
8 |
9 | "github.com/twmb/franz-go/pkg/kgo"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "go.uber.org/zap"
12 | )
13 |
14 | func (s *Service) startConsumeMessages(ctx context.Context, initializedCh chan<- bool) {
15 | client := s.client
16 |
17 | s.logger.Info("starting to consume end-to-end topic",
18 | zap.String("topic_name", s.config.TopicManagement.Name),
19 | zap.String("group_id", s.groupId))
20 |
21 | isInitialized := false
22 | for {
23 | fetches := client.PollFetches(ctx)
24 | if !isInitialized {
25 | isInitialized = true
26 | initializedCh <- true
27 | close(initializedCh)
28 | }
29 |
30 | // Log all errors and continue afterwards as we might get errors and still have some fetch results
31 | errors := fetches.Errors()
32 | for _, err := range errors {
33 | s.logger.Error("kafka fetch error",
34 | zap.String("topic", err.Topic),
35 | zap.Int32("partition", err.Partition),
36 | zap.Error(err.Err))
37 | }
38 |
39 | fetches.EachRecord(s.processMessage)
40 | }
41 | }
42 |
43 | func (s *Service) commitOffsets(ctx context.Context) {
44 | client := s.client
45 | uncommittedOffset := client.UncommittedOffsets()
46 | if uncommittedOffset == nil {
47 | return
48 | }
49 |
50 | startCommitTimestamp := time.Now()
51 |
52 | childCtx, cancel := context.WithTimeout(ctx, s.config.Consumer.CommitSla)
53 | client.CommitOffsets(childCtx, uncommittedOffset, func(_ *kgo.Client, req *kmsg.OffsetCommitRequest, r *kmsg.OffsetCommitResponse, err error) {
54 | cancel()
55 |
56 | coordinator := s.clientHooks.currentCoordinator.Load().(kgo.BrokerMetadata)
57 | coordinatorID := strconv.Itoa(int(coordinator.NodeID))
58 |
59 | latency := time.Since(startCommitTimestamp)
60 | s.offsetCommitLatency.WithLabelValues(coordinatorID).Observe(latency.Seconds())
61 | s.offsetCommitsTotal.WithLabelValues(coordinatorID).Inc()
62 | // We do this to ensure that a series with that coordinator id is initialized
63 | s.offsetCommitsTotal.WithLabelValues(coordinatorID).Add(0)
64 |
65 | // If we have at least one error in our commit response we want to report it as an error with an appropriate
66 | // reason as label.
67 | if errCode := s.logCommitErrors(r, err); errCode != "" {
68 | s.offsetCommitsFailedTotal.WithLabelValues(coordinatorID, errCode).Inc()
69 | return
70 | }
71 | })
72 | }
73 |
74 | // processMessage:
75 | // - deserializes the message
76 | // - checks if it is from us, or from another kminion process running somewhere else
77 | // - hands it off to the service, which then reports metrics on it
78 | func (s *Service) processMessage(record *kgo.Record) {
79 | if record.Value == nil {
80 | // Init messages have nil values - we want to skip these. They are only used to make sure a consumer is ready.
81 | return
82 | }
83 |
84 | var msg EndToEndMessage
85 | if jerr := json.Unmarshal(record.Value, &msg); jerr != nil {
86 | s.logger.Error("failed to unmarshal message value", zap.Error(jerr))
87 | return // maybe older version
88 | }
89 |
90 | if msg.MinionID != s.minionID {
91 | return // not from us
92 | }
93 |
94 | // restore partition, which is not serialized
95 | msg.partition = int(record.Partition)
96 | s.messageTracker.onMessageArrived(&msg)
97 | }
98 |
--------------------------------------------------------------------------------
/e2e/endtoend_message.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import "time"
4 |
5 | const (
6 | _ = iota
7 | EndToEndMessageStateCreated
8 | EndToEndMessageStateProducedSuccessfully
9 | )
10 |
11 | type EndToEndMessage struct {
12 | MinionID string `json:"minionID"` // unique for each running kminion instance
13 | MessageID string `json:"messageID"` // unique for each message
14 | Timestamp int64 `json:"createdUtcNs"` // when the message was created, unix nanoseconds
15 |
16 | // The following properties are only used within the message tracker
17 | partition int
18 | state int
19 | produceLatency float64
20 | }
21 |
22 | func (m *EndToEndMessage) creationTime() time.Time {
23 | return time.Unix(0, m.Timestamp)
24 | }
25 |
--------------------------------------------------------------------------------
/e2e/group_tracker.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "context"
5 | "strings"
6 | "time"
7 |
8 | "github.com/twmb/franz-go/pkg/kerr"
9 | "github.com/twmb/franz-go/pkg/kgo"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "go.uber.org/zap"
12 | )
13 |
14 | const (
15 | oldGroupCheckInterval = 5 * time.Second // how often to check for old kminion groups
16 | oldGroupMaxAge = 20 * time.Second // maximum age after which an old group should be deleted
17 | )
18 |
19 | // groupTracker keeps checking for empty consumerGroups matching the kminion prefix.
20 | // When a group was seen empty for some time, we delete it.
21 | // Why?
22 | // Whenever a kminion instance starts up it creates a consumer-group for itself in order to not "collide" with other kminion instances.
23 | // When an instance restarts (for whatever reason), it creates a new group again, so we'd end up with a lot of unused groups.
24 | type groupTracker struct {
25 | cfg Config
26 | logger *zap.Logger
27 | client *kgo.Client // kafka client
28 | groupId string // our own groupId
29 | potentiallyEmptyGroups map[string]time.Time // groupName -> utc timestamp when the group was first seen
30 | }
31 |
32 | func newGroupTracker(cfg Config, logger *zap.Logger, client *kgo.Client, groupID string) *groupTracker {
33 | return &groupTracker{
34 | cfg: cfg,
35 | logger: logger.Named("group_tracker"),
36 | client: client,
37 | groupId: groupID,
38 | potentiallyEmptyGroups: make(map[string]time.Time),
39 | }
40 | }
41 |
42 | func (g *groupTracker) start(ctx context.Context) {
43 | g.logger.Debug("starting group tracker")
44 |
45 | deleteOldGroupsTicker := time.NewTicker(oldGroupCheckInterval)
46 | for {
47 | select {
48 | case <-ctx.Done():
49 | g.logger.Debug("stopping group tracker, context was cancelled")
50 | return
51 | case <-deleteOldGroupsTicker.C:
52 | childCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
53 | err := g.checkAndDeleteOldConsumerGroups(childCtx)
54 | if err != nil {
55 | 				g.logger.Error("failed to check for old consumer groups", zap.Error(err))
56 | }
57 | cancel()
58 | }
59 | }
60 | }
61 |
62 | func (g *groupTracker) checkAndDeleteOldConsumerGroups(ctx context.Context) error {
63 | groupsRq := kmsg.NewListGroupsRequest()
64 | groupsRq.StatesFilter = []string{"Empty"}
65 |
66 | g.logger.Debug("checking for stale kminion consumer groups")
67 |
68 | shardedResponse := g.client.RequestSharded(ctx, &groupsRq)
69 |
70 | // find groups that start with the kminion prefix
71 | matchingGroups := make([]string, 0)
72 | for _, shard := range shardedResponse {
73 | if shard.Err != nil {
74 | g.logger.Error("error in response to ListGroupsRequest", zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err))
75 | continue
76 | }
77 |
78 | r, ok := shard.Resp.(*kmsg.ListGroupsResponse)
79 | if !ok {
80 | g.logger.Error("cannot cast responseShard.Resp to kmsg.ListGroupsResponse")
81 | continue
82 | }
83 |
84 | for _, group := range r.Groups {
85 | name := group.Group
86 |
87 | if name == g.groupId {
88 | continue // skip our own consumer group
89 | }
90 |
91 | if strings.HasPrefix(name, g.cfg.Consumer.GroupIdPrefix) {
92 | matchingGroups = append(matchingGroups, name)
93 | }
94 | }
95 | }
96 |
97 | // save new (previously unseen) groups to tracker
98 | g.logger.Debug("checked for stale consumer groups", zap.Int("found_groups", len(matchingGroups)), zap.Strings("groups", matchingGroups))
99 | for _, name := range matchingGroups {
100 | _, exists := g.potentiallyEmptyGroups[name]
101 | if !exists {
102 | // add it with the current timestamp
103 | g.potentiallyEmptyGroups[name] = time.Now()
104 | g.logger.Debug("found new empty kminion group, adding it to the tracker", zap.String("group", name))
105 | }
106 | }
107 |
108 | // go through saved groups:
109 | // - don't track the ones we don't see anymore (bc they got deleted or are not empty anymore)
110 | // - mark the ones that are too old (have been observed as empty for too long)
111 | groupsToDelete := make([]string, 0)
112 | for name, firstSeen := range g.potentiallyEmptyGroups {
113 | exists, _ := containsStr(matchingGroups, name)
114 | if exists {
115 | // still there, check age and maybe delete it
116 | age := time.Since(firstSeen)
117 | if age > oldGroupMaxAge {
118 | // group was unused for too long, delete it
119 | groupsToDelete = append(groupsToDelete, name)
120 | delete(g.potentiallyEmptyGroups, name)
121 | }
122 | } else {
123 | // does not exist anymore, it must have been deleted, or is in use now (no longer empty)
124 | // don't track it anymore
125 | delete(g.potentiallyEmptyGroups, name)
126 | }
127 | }
128 |
129 | // actually delete the groups we've decided to delete
130 | if len(groupsToDelete) == 0 {
131 | return nil
132 | }
133 |
134 | deleteRq := kmsg.NewDeleteGroupsRequest()
135 | deleteRq.Groups = groupsToDelete
136 | deleteResp := g.client.RequestSharded(ctx, &deleteRq)
137 |
138 | // done, now just errors
139 | // if we get a not authorized error we'll disable deleting groups
140 | foundNotAuthorizedError := false
141 | deletedGroups := make([]string, 0)
142 | for _, shard := range deleteResp {
143 | if shard.Err != nil {
144 | g.logger.Error("sharded consumer group delete request failed", zap.Error(shard.Err))
145 | continue
146 | }
147 |
148 | resp, ok := shard.Resp.(*kmsg.DeleteGroupsResponse)
149 | if !ok {
150 | g.logger.Error("failed to cast shard response to DeleteGroupsResponse while handling an error for deleting groups", zap.String("shard_host", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err))
151 | continue
152 | }
153 |
154 | for _, groupResp := range resp.Groups {
155 | err := kerr.ErrorForCode(groupResp.ErrorCode)
156 | if err != nil {
157 | g.logger.Error("failed to delete consumer group", zap.String("shard", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.String("group", groupResp.Group), zap.Error(err))
158 |
159 | if groupResp.ErrorCode == kerr.GroupAuthorizationFailed.Code {
160 | foundNotAuthorizedError = true
161 | }
162 |
163 | } else {
164 | deletedGroups = append(deletedGroups, groupResp.Group)
165 | }
166 | }
167 | }
168 | g.logger.Info("deleted old consumer groups", zap.Strings("deleted_groups", deletedGroups))
169 |
170 | if foundNotAuthorizedError {
171 | 		g.logger.Info("disabling trying to delete old kminion consumer groups since one of the last delete results had a 'GroupAuthorizationFailed' error")
172 | }
173 |
174 | return nil
175 | }
176 |
--------------------------------------------------------------------------------
/e2e/message_tracker.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "fmt"
5 | "strconv"
6 | "time"
7 |
8 | "github.com/jellydator/ttlcache/v2"
9 |
10 | "go.uber.org/zap"
11 | )
12 |
13 | // messageTracker keeps track of the messages' lifetime
14 | //
15 | // When we successfully send a message, it will be added to this tracker.
16 | // Later, when we receive the message back in the consumer, the message is marked as completed and removed from the tracker.
17 | // If the message does not arrive within the configured `consumer.roundtripSla`, it is counted as lost. Messages that
18 | // failed to be produced will not be considered lost messages.
20 | //
21 | // We use a dedicated counter to track messages that couldn't be produced to Kafka.
22 | type messageTracker struct {
23 | svc *Service
24 | logger *zap.Logger
25 | cache *ttlcache.Cache
26 | }
27 |
28 | func newMessageTracker(svc *Service) *messageTracker {
29 | defaultExpirationDuration := svc.config.Consumer.RoundtripSla
30 | cache := ttlcache.NewCache()
31 | cache.SetTTL(defaultExpirationDuration)
32 |
33 | t := &messageTracker{
34 | svc: svc,
35 | logger: svc.logger.Named("message_tracker"),
36 | cache: cache,
37 | }
38 | t.cache.SetExpirationReasonCallback(func(key string, reason ttlcache.EvictionReason, value interface{}) {
39 | t.onMessageExpired(key, reason, value.(*EndToEndMessage))
40 | })
41 |
42 | return t
43 | }
44 |
45 | func (t *messageTracker) addToTracker(msg *EndToEndMessage) {
46 | t.cache.Set(msg.MessageID, msg)
47 | }
48 |
49 | // updateItemIfExists only updates a message if it still exists in the cache. The remaining time to live will not
50 | // be refreshed.
51 | // If it doesn't exist, a ttlcache.ErrNotFound error will be returned.
52 | func (t *messageTracker) updateItemIfExists(msg *EndToEndMessage) error {
53 | _, ttl, err := t.cache.GetWithTTL(msg.MessageID)
54 | if err != nil {
55 | if err == ttlcache.ErrNotFound {
56 | return err
57 | }
58 | panic(err)
59 | }
60 |
61 | // Because the returned TTL is set to the original TTL duration (and not the remaining TTL) we have to calculate
62 | 	// the remaining TTL now as we want to update the existing cache item without changing the remaining time to live.
63 | expiryTimestamp := msg.creationTime().Add(ttl)
64 | remainingTTL := expiryTimestamp.Sub(time.Now())
65 | if remainingTTL < 0 {
66 | // This entry should have been deleted already. Race condition.
67 | return ttlcache.ErrNotFound
68 | }
69 |
70 | err = t.cache.SetWithTTL(msg.MessageID, msg, remainingTTL)
71 | if err != nil {
72 | panic(err)
73 | }
74 |
75 | return nil
76 | }
77 |
78 | // removeFromTracker removes an entry from the cache. If the key does not exist it will return an ttlcache.ErrNotFound error.
79 | func (t *messageTracker) removeFromTracker(messageID string) error {
80 | return t.cache.Remove(messageID)
81 | }
82 |
83 | func (t *messageTracker) onMessageArrived(arrivedMessage *EndToEndMessage) {
84 | cm, err := t.cache.Get(arrivedMessage.MessageID)
85 | if err != nil {
86 | if err == ttlcache.ErrNotFound {
87 | // message expired and was removed from the cache
88 | // it arrived too late, nothing to do here...
89 | return
90 | } else {
91 | panic(fmt.Errorf("failed to get message from cache: %w", err))
92 | }
93 | }
94 |
95 | msg := cm.(*EndToEndMessage)
96 |
97 | expireTime := msg.creationTime().Add(t.svc.config.Consumer.RoundtripSla)
98 | 	isOnTime := time.Now().Before(expireTime)
99 | 	latency := time.Since(msg.creationTime())
100 |
101 | 	if !isOnTime {
102 | // Message arrived late, but was still in cache. We don't increment the lost counter here because eventually
103 | // it will be evicted from the cache. This case should only pop up if the sla time is exceeded, but if the
104 | // item has not been evicted from the cache yet.
105 | t.logger.Info("message arrived late, will be marked as a lost message",
106 | zap.Int64("delay_ms", latency.Milliseconds()),
107 | zap.String("id", msg.MessageID))
108 | return
109 | }
110 |
111 | // message arrived early enough
112 | pID := strconv.Itoa(msg.partition)
113 | t.svc.messagesReceived.WithLabelValues(pID).Inc()
114 | t.svc.roundtripLatency.WithLabelValues(pID).Observe(latency.Seconds())
115 |
116 | // Remove message from cache, so that we don't track it any longer and won't mark it as lost when the entry expires.
117 | t.cache.Remove(msg.MessageID)
118 | }
119 |
120 | func (t *messageTracker) onMessageExpired(_ string, reason ttlcache.EvictionReason, value interface{}) {
121 | if reason == ttlcache.Removed {
122 | // We are not interested in messages that have been removed by us!
123 | return
124 | }
125 |
126 | msg := value.(*EndToEndMessage)
127 |
128 | created := msg.creationTime()
129 | age := time.Since(created)
130 | t.svc.lostMessages.WithLabelValues(strconv.Itoa(msg.partition)).Inc()
131 |
132 | t.logger.Debug("message expired/lost",
133 | zap.Int64("age_ms", age.Milliseconds()),
134 | zap.Int("partition", msg.partition),
135 | zap.String("message_id", msg.MessageID),
136 | zap.Bool("successfully_produced", msg.state == EndToEndMessageStateProducedSuccessfully),
137 | zap.Float64("produce_latency_seconds", msg.produceLatency),
138 | )
139 | }
140 |
--------------------------------------------------------------------------------
/e2e/producer.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "strconv"
7 | "time"
8 |
9 | "github.com/google/uuid"
10 | "github.com/twmb/franz-go/pkg/kgo"
11 | "go.uber.org/zap"
12 | )
13 |
14 | // produceMessagesToAllPartitions sends an EndToEndMessage to every partition on the given topic
15 | func (s *Service) produceMessagesToAllPartitions(ctx context.Context) {
16 | for i := 0; i < s.partitionCount; i++ {
17 | s.produceMessage(ctx, i)
18 | }
19 | }
20 |
21 | // produceMessage produces an end to end record to a single given partition. If it succeeds producing the record
22 | // it will add it to the message tracker. If producing fails a message will be logged and the respective metrics
23 | // will be incremented.
24 | func (s *Service) produceMessage(ctx context.Context, partition int) {
25 | topicName := s.config.TopicManagement.Name
26 | record, msg := createEndToEndRecord(s.minionID, topicName, partition)
27 |
28 | startTime := time.Now()
29 |
30 | // This childCtx will ensure that we will abort our efforts to produce (including retries) when we exceed
31 | // the SLA for producers.
32 | childCtx, cancel := context.WithTimeout(ctx, s.config.Producer.AckSla+2*time.Second)
33 |
34 | pID := strconv.Itoa(partition)
35 | s.messagesProducedInFlight.WithLabelValues(pID).Inc()
36 | s.messageTracker.addToTracker(msg)
37 | s.client.TryProduce(childCtx, record, func(r *kgo.Record, err error) {
38 | defer cancel()
39 | ackDuration := time.Since(startTime)
40 | s.messagesProducedInFlight.WithLabelValues(pID).Dec()
41 | s.messagesProducedTotal.WithLabelValues(pID).Inc()
42 | // We add 0 in order to ensure that the "failed" metric series for that partition id are initialized as well.
43 | s.messagesProducedFailed.WithLabelValues(pID).Add(0)
44 | s.lostMessages.WithLabelValues(pID).Add(0)
45 |
46 | if err != nil {
47 | s.messagesProducedFailed.WithLabelValues(pID).Inc()
48 | _ = s.messageTracker.removeFromTracker(msg.MessageID)
49 |
50 | s.logger.Info("failed to produce message to end-to-end topic",
51 | zap.String("topic_name", r.Topic),
52 | zap.Int32("partition", r.Partition),
53 | zap.Error(err))
54 | return
55 | } else {
56 | // Update the message's state. If this message expires and is marked as successfully produced we will
57 | // report this as a lost message, which would indicate that the producer was told that the message got
58 | // produced successfully, but it got lost somewhere.
59 | // We need to use updateItemIfExists() because it's possible that the message has already been consumed
60 | // before we have received the message here (because we were awaiting the produce ack).
61 | msg.state = EndToEndMessageStateProducedSuccessfully
62 | msg.produceLatency = ackDuration.Seconds()
63 |
64 | // TODO: Enable again as soon as https://github.com/ReneKroon/ttlcache/issues/60 is fixed
65 | // Because we cannot update cache items in an atomic fashion we currently can't use this method
66 | // as this would cause a race condition which ends up in records being reported as lost/expired.
67 | // s.messageTracker.updateItemIfExists(msg)
68 | }
69 |
70 | s.produceLatency.WithLabelValues(pID).Observe(ackDuration.Seconds())
71 | })
72 | }
73 |
74 | func createEndToEndRecord(minionID string, topicName string, partition int) (*kgo.Record, *EndToEndMessage) {
75 | message := &EndToEndMessage{
76 | MinionID: minionID,
77 | MessageID: uuid.NewString(),
78 | Timestamp: time.Now().UnixNano(),
79 |
80 | partition: partition,
81 | state: EndToEndMessageStateCreated,
82 | }
83 |
84 | mjson, err := json.Marshal(message)
85 | if err != nil {
86 | // Should never happen since the struct is so simple,
87 | // but if it does, something is completely broken anyway
88 | panic("cannot serialize EndToEndMessage")
89 | }
90 |
91 | record := &kgo.Record{
92 | Topic: topicName,
93 | Value: mjson,
94 | Partition: int32(partition), // we set partition for producing so our customPartitioner can make use of it
95 | }
96 |
97 | return record, message
98 | }
99 |
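The Partition field set above only takes effect if the client's partitioner honors it. kminion registers its own custom partitioner (not shown in this section); a rough stand-in using franz-go's built-in manual partitioner, purely for illustration, would be:

package e2e

import "github.com/twmb/franz-go/pkg/kgo"

// manualPartitionerOpt makes kgo produce each record to the partition set in record.Partition,
// which is what createEndToEndRecord relies on.
func manualPartitionerOpt() kgo.Opt {
	return kgo.RecordPartitioner(kgo.ManualPartitioner())
}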
--------------------------------------------------------------------------------
/e2e/topic_test.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "github.com/stretchr/testify/assert"
5 | "github.com/twmb/franz-go/pkg/kmsg"
6 | "sort"
7 | "testing"
8 | )
9 |
10 | func TestCalculateAppropriateReplicas(t *testing.T) {
11 | tt := []struct {
12 | TestName string
13 | Brokers []kmsg.MetadataResponseBroker
14 | ReplicationFactor int
15 | LeaderBroker kmsg.MetadataResponseBroker
16 |
17 | // Some cases may have more than one possible solution, each entry in the outer array covers one allowed
18 | // solution. The compared int32 array order does not matter, except for the very first item as this indicates
19 | // the preferred leader. For example if you use {2, 0, 1} as expected result this would also be valid for
20 | // the actual result {2, 1, 0} but not for {1, 2, 0} - because '2' must be the first int32.
21 | ExpectedResults [][]int32
22 | }{
23 | {
24 | TestName: "3 Brokers, no rack, RF = 3",
25 | Brokers: []kmsg.MetadataResponseBroker{
26 | {NodeID: 0, Rack: nil},
27 | {NodeID: 1, Rack: nil},
28 | {NodeID: 2, Rack: nil},
29 | },
30 | ReplicationFactor: 3,
31 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 2, Rack: nil},
32 | ExpectedResults: [][]int32{{2, 0, 1}},
33 | },
34 |
35 | {
36 | TestName: "3 Brokers, 3 racks, RF = 3",
37 | Brokers: []kmsg.MetadataResponseBroker{
38 | {NodeID: 0, Rack: kmsg.StringPtr("a")},
39 | {NodeID: 1, Rack: kmsg.StringPtr("b")},
40 | {NodeID: 2, Rack: kmsg.StringPtr("c")},
41 | },
42 | ReplicationFactor: 3,
43 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 2, Rack: kmsg.StringPtr("c")},
44 | ExpectedResults: [][]int32{{2, 0, 1}},
45 | },
46 |
47 | {
48 | TestName: "3 Brokers, 3 racks, RF = 1",
49 | Brokers: []kmsg.MetadataResponseBroker{
50 | {NodeID: 0, Rack: kmsg.StringPtr("a")},
51 | {NodeID: 1, Rack: kmsg.StringPtr("b")},
52 | {NodeID: 2, Rack: kmsg.StringPtr("c")},
53 | },
54 | ReplicationFactor: 1,
55 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 1, Rack: kmsg.StringPtr("b")},
56 | ExpectedResults: [][]int32{{1}},
57 | },
58 |
59 | {
60 | TestName: "3 Brokers, 3 racks, RF = 2",
61 | Brokers: []kmsg.MetadataResponseBroker{
62 | {NodeID: 0, Rack: kmsg.StringPtr("a")},
63 | {NodeID: 1, Rack: kmsg.StringPtr("b")},
64 | {NodeID: 2, Rack: kmsg.StringPtr("c")},
65 | },
66 | ReplicationFactor: 2,
67 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 1, Rack: kmsg.StringPtr("b")},
68 | ExpectedResults: [][]int32{{1, 0}, {1, 2}},
69 | },
70 |
71 | {
72 | TestName: "6 Brokers, 3 racks, RF = 3",
73 | Brokers: []kmsg.MetadataResponseBroker{
74 | {NodeID: 0, Rack: kmsg.StringPtr("a")},
75 | {NodeID: 1, Rack: kmsg.StringPtr("b")},
76 | {NodeID: 2, Rack: kmsg.StringPtr("c")},
77 | {NodeID: 3, Rack: kmsg.StringPtr("a")},
78 | {NodeID: 4, Rack: kmsg.StringPtr("b")},
79 | {NodeID: 5, Rack: kmsg.StringPtr("c")},
80 | },
81 | ReplicationFactor: 3,
82 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 4, Rack: kmsg.StringPtr("b")},
83 | ExpectedResults: [][]int32{{4, 0, 2}, {4, 0, 5}, {4, 3, 2}, {4, 3, 5}},
84 | },
85 |
86 | {
87 | TestName: "4 Brokers, 2 racks, RF = 3",
88 | Brokers: []kmsg.MetadataResponseBroker{
89 | {NodeID: 0, Rack: kmsg.StringPtr("a")},
90 | {NodeID: 1, Rack: kmsg.StringPtr("b")},
91 | {NodeID: 2, Rack: kmsg.StringPtr("a")},
92 | {NodeID: 3, Rack: kmsg.StringPtr("b")},
93 | },
94 | ReplicationFactor: 3,
95 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 0, Rack: kmsg.StringPtr("a")},
96 | ExpectedResults: [][]int32{{0, 1, 2}, {0, 1, 3}, {0, 2, 3}},
97 | },
98 |
99 | {
100 | TestName: "6 Brokers, 3 racks, RF = 3, lowest node id != 0",
101 | Brokers: []kmsg.MetadataResponseBroker{
102 | {NodeID: 10, Rack: kmsg.StringPtr("a")},
103 | {NodeID: 11, Rack: kmsg.StringPtr("b")},
104 | {NodeID: 12, Rack: kmsg.StringPtr("c")},
105 | {NodeID: 13, Rack: kmsg.StringPtr("a")},
106 | {NodeID: 14, Rack: kmsg.StringPtr("b")},
107 | {NodeID: 15, Rack: kmsg.StringPtr("c")},
108 | },
109 | ReplicationFactor: 3,
110 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 11, Rack: kmsg.StringPtr("b")},
111 | ExpectedResults: [][]int32{{11, 10, 12}, {11, 12, 13}, {11, 13, 15}},
112 | },
113 |
114 | {
115 | TestName: "6 Brokers, 3 racks, RF = 5, lowest node id != 0",
116 | Brokers: []kmsg.MetadataResponseBroker{
117 | {NodeID: 10, Rack: kmsg.StringPtr("a")},
118 | {NodeID: 11, Rack: kmsg.StringPtr("b")},
119 | {NodeID: 12, Rack: kmsg.StringPtr("c")},
120 | {NodeID: 13, Rack: kmsg.StringPtr("a")},
121 | {NodeID: 14, Rack: kmsg.StringPtr("b")},
122 | {NodeID: 15, Rack: kmsg.StringPtr("c")},
123 | },
124 | ReplicationFactor: 5,
125 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 11, Rack: kmsg.StringPtr("b")},
126 | ExpectedResults: [][]int32{{11, 10, 12, 13, 14}, {11, 10, 13, 14, 15}, {11, 12, 13, 14, 15}, {11, 10, 12, 13, 15}, {11, 10, 12, 14, 15}},
127 | },
128 | }
129 |
130 | svc := Service{}
131 | for _, test := range tt {
132 | meta := kmsg.NewMetadataResponse()
133 | meta.Brokers = test.Brokers
134 | replicaIDs := svc.calculateAppropriateReplicas(&meta, test.ReplicationFactor, test.LeaderBroker)
135 |
136 | matchesAtLeastOneExpectedResult := false
137 | for _, possibleResult := range test.ExpectedResults {
138 | isValidResult := possibleResult[0] == replicaIDs[0] && doElementsMatch(possibleResult, replicaIDs)
139 | if isValidResult {
140 | matchesAtLeastOneExpectedResult = true
141 | break
142 | }
143 | }
144 | if !matchesAtLeastOneExpectedResult {
145 | // Use assert.ElementsMatch with the first expected result to print one valid solution along with the actual result.
146 | assert.ElementsMatch(t, test.ExpectedResults[0], replicaIDs, test.TestName)
147 | }
148 | }
149 | }
150 |
151 | func doElementsMatch(a, b []int32) bool {
152 | if len(a) != len(b) {
153 | return false
154 | }
155 |
156 | sort.Slice(a, func(i, j int) bool { return a[i] < a[j] })
157 | sort.Slice(b, func(i, j int) bool { return b[i] < b[j] })
158 | for i, num := range a {
159 | if num != b[i] {
160 | return false
161 | }
162 | }
163 |
164 | return true
165 | }
166 |
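A small companion sketch, not part of kminion's test suite, that pins down the contract of doElementsMatch: order is ignored, but the two slices must contain the same elements with the same multiplicity:

package e2e

import "testing"

func TestDoElementsMatchSketch(t *testing.T) {
	if !doElementsMatch([]int32{2, 0, 1}, []int32{1, 2, 0}) {
		t.Error("expected {2,0,1} and {1,2,0} to match regardless of order")
	}
	if doElementsMatch([]int32{2, 0, 1}, []int32{2, 0, 0}) {
		t.Error("expected {2,0,1} and {2,0,0} not to match")
	}
}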
--------------------------------------------------------------------------------
/e2e/utils.go:
--------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "context"
5 | "math"
6 | "time"
7 |
8 | "github.com/prometheus/client_golang/prometheus"
9 | "github.com/twmb/franz-go/pkg/kerr"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "go.uber.org/zap"
12 | )
13 |
14 | // createHistogramBuckets creates exponential (power-of-two) histogram buckets that start at 5ms and
15 | // grow until the largest bucket covers the given maximum expected latency.
16 | func createHistogramBuckets(maxLatency time.Duration) []float64 {
17 | // math.Logb returns the binary exponent, i.e. roughly log2(maxLatency / 10ms).
18 | // The bucket series effectively uses 20ms as its base: the +3 below adds the 5ms and 10ms buckets that
19 | // precede it plus one extra bucket to compensate for the integer truncation of the exponent, so that the
20 | // upper bound of the last bucket is always at least maxLatency.
21 | latencyCount := math.Logb(float64(maxLatency.Milliseconds() / 10))
22 | count := int(latencyCount) + 3
23 | bucket := prometheus.ExponentialBuckets(0.005, 2, count)
24 |
25 | return bucket
26 | }
27 |
28 | func containsStr(ar []string, x string) (bool, int) {
29 | for i, item := range ar {
30 | if item == x {
31 | return true, i
32 | }
33 | }
34 | return false, -1
35 | }
36 |
37 | // logCommitErrors logs all errors in the commit response and returns a short error code for the last error, if any
38 | func (s *Service) logCommitErrors(r *kmsg.OffsetCommitResponse, err error) string {
39 | if err != nil {
40 | if err == context.DeadlineExceeded {
41 | s.logger.Warn("offset commit failed because SLA has been exceeded")
42 | return "OFFSET_COMMIT_SLA_EXCEEDED"
43 | }
44 |
45 | s.logger.Warn("offset commit failed", zap.Error(err))
46 | return "RESPONSE_ERROR"
47 | }
48 |
49 | lastErrCode := ""
50 | for _, t := range r.Topics {
51 | for _, p := range t.Partitions {
52 | typedErr := kerr.TypedErrorForCode(p.ErrorCode)
53 | if typedErr == nil {
54 | continue
55 | }
56 |
57 | s.logger.Warn("error committing partition offset",
58 | zap.String("topic", t.Topic),
59 | zap.Int32("partition_id", p.Partition),
60 | zap.Error(typedErr),
61 | )
62 | lastErrCode = typedErr.Message
63 | }
64 | }
65 |
66 | return lastErrCode
67 | }
68 |
69 | // brokerMetadataByBrokerID returns a map of all broker metadata keyed by their BrokerID
70 | func brokerMetadataByBrokerID(meta []kmsg.MetadataResponseBroker) map[int32]kmsg.MetadataResponseBroker {
71 | res := make(map[int32]kmsg.MetadataResponseBroker)
72 | for _, broker := range meta {
73 | res[broker.NodeID] = broker
74 | }
75 | return res
76 | }
77 |
78 | // brokerMetadataByRackID returns a map of all broker metadata keyed by their Rack identifier
79 | func brokerMetadataByRackID(meta []kmsg.MetadataResponseBroker) map[string][]kmsg.MetadataResponseBroker {
80 | res := make(map[string][]kmsg.MetadataResponseBroker)
81 | for _, broker := range meta {
82 | rackID := ""
83 | if broker.Rack != nil {
84 | rackID = *broker.Rack
85 | }
86 | res[rackID] = append(res[rackID], broker)
87 | }
88 | return res
89 | }
90 |
91 | func pointerStrToStr(str *string) string {
92 | if str == nil {
93 | return ""
94 | }
95 | return *str
96 | }
97 |
98 | func safeUnwrap(err error) string {
99 | if err == nil {
100 | return ""
101 | }
102 | return err.Error()
103 | }
104 |
105 | func isInArray(num int16, arr []int16) bool {
106 | for _, n := range arr {
107 | if num == n {
108 | return true
109 | }
110 | }
111 | return false
112 | }
113 |
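As a worked example of the bucket math in createHistogramBuckets, not part of kminion's test suite: for a 2s roundtrip SLA, maxLatency.Milliseconds()/10 = 200 and math.Logb(200) = 7, so 10 buckets are created, doubling from 5ms up to 2.56s:

package e2e

import (
	"math"
	"testing"
	"time"
)

func TestCreateHistogramBucketsSketch(t *testing.T) {
	// Expected series: 0.005 0.01 0.02 0.04 0.08 0.16 0.32 0.64 1.28 2.56 (seconds)
	got := createHistogramBuckets(2 * time.Second)
	if len(got) != 10 {
		t.Fatalf("expected 10 buckets, got %d (%v)", len(got), got)
	}
	if last := got[len(got)-1]; math.Abs(last-2.56) > 1e-9 {
		t.Fatalf("expected the largest bucket to be ~2.56s, got %v", last)
	}
}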
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/cloudhut/kminion/v2
2 |
3 | go 1.24
4 |
5 | require (
6 | github.com/google/uuid v1.6.0
7 | github.com/jcmturner/gokrb5/v8 v8.4.4
8 | github.com/jellydator/ttlcache/v2 v2.11.1
9 | github.com/knadh/koanf v1.5.0
10 | github.com/mitchellh/mapstructure v1.5.0
11 | github.com/orcaman/concurrent-map v1.0.0
12 | github.com/pkg/errors v0.9.1
13 | github.com/prometheus/client_golang v1.20.5
14 | github.com/stretchr/testify v1.9.0
15 | github.com/twmb/franz-go v1.18.0
16 | github.com/twmb/franz-go/pkg/kadm v1.14.0
17 | github.com/twmb/franz-go/pkg/kmsg v1.9.0
18 | github.com/twmb/franz-go/pkg/sasl/kerberos v1.1.0
19 | go.uber.org/atomic v1.11.0
20 | go.uber.org/zap v1.27.0
21 | golang.org/x/sync v0.8.0
22 | )
23 |
24 | require (
25 | github.com/beorn7/perks v1.0.1 // indirect
26 | github.com/cespare/xxhash/v2 v2.3.0 // indirect
27 | github.com/davecgh/go-spew v1.1.1 // indirect
28 | github.com/fsnotify/fsnotify v1.8.0 // indirect
29 | github.com/hashicorp/go-uuid v1.0.3 // indirect
30 | github.com/jcmturner/aescts/v2 v2.0.0 // indirect
31 | github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect
32 | github.com/jcmturner/gofork v1.7.6 // indirect
33 | github.com/jcmturner/rpc/v2 v2.0.3 // indirect
34 | github.com/klauspost/compress v1.17.11 // indirect
35 | github.com/mitchellh/copystructure v1.2.0 // indirect
36 | github.com/mitchellh/reflectwalk v1.0.2 // indirect
37 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
38 | github.com/pelletier/go-toml v1.9.1 // indirect
39 | github.com/pierrec/lz4/v4 v4.1.21 // indirect
40 | github.com/pmezard/go-difflib v1.0.0 // indirect
41 | github.com/prometheus/client_model v0.6.1 // indirect
42 | github.com/prometheus/common v0.60.1 // indirect
43 | github.com/prometheus/procfs v0.15.1 // indirect
44 | go.uber.org/multierr v1.11.0 // indirect
45 | golang.org/x/crypto v0.36.0 // indirect
46 | golang.org/x/net v0.37.0 // indirect
47 | golang.org/x/sys v0.31.0 // indirect
48 | google.golang.org/protobuf v1.35.1 // indirect
49 | gopkg.in/yaml.v3 v3.0.1 // indirect
50 | )
51 |
--------------------------------------------------------------------------------
/kafka/client_config_helper.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | import (
4 | "context"
5 | "crypto/tls"
6 | "crypto/x509"
7 | "encoding/pem"
8 | "fmt"
9 | "io/ioutil"
10 | "net"
11 | "time"
12 |
13 | "github.com/jcmturner/gokrb5/v8/client"
14 | "github.com/jcmturner/gokrb5/v8/keytab"
15 | "github.com/twmb/franz-go/pkg/kgo"
16 | "github.com/twmb/franz-go/pkg/kversion"
17 | "github.com/twmb/franz-go/pkg/sasl"
18 | "github.com/twmb/franz-go/pkg/sasl/kerberos"
19 | "github.com/twmb/franz-go/pkg/sasl/oauth"
20 | "github.com/twmb/franz-go/pkg/sasl/plain"
21 | "github.com/twmb/franz-go/pkg/sasl/scram"
22 | "go.uber.org/zap"
23 |
24 | krbconfig "github.com/jcmturner/gokrb5/v8/config"
25 | )
26 |
27 | // NewKgoConfig creates a new Config for the Kafka Client as exposed by the franz-go library.
28 | // If TLS certificates can't be read an error will be returned.
29 | // logger is only used to print warnings about TLS.
30 | func NewKgoConfig(cfg Config, logger *zap.Logger) ([]kgo.Opt, error) {
31 | opts := []kgo.Opt{
32 | kgo.SeedBrokers(cfg.Brokers...),
33 | kgo.MaxVersions(kversion.V2_7_0()),
34 | kgo.ClientID(cfg.ClientID),
35 | kgo.FetchMaxBytes(5 * 1000 * 1000), // 5MB
36 | kgo.MaxConcurrentFetches(10),
37 | // Allow metadata to be refreshed more often than 5s (default) if needed.
38 | // That will mitigate issues with unknown partitions shortly after creating
39 | // them.
40 | kgo.MetadataMinAge(time.Second),
41 | }
42 |
43 | // Create Logger
44 | kgoLogger := KgoZapLogger{
45 | logger: logger.Sugar(),
46 | }
47 | opts = append(opts, kgo.WithLogger(kgoLogger))
48 |
49 | // Add Rack Awareness if configured
50 | if cfg.RackID != "" {
51 | opts = append(opts, kgo.Rack(cfg.RackID))
52 | }
53 |
54 | // Configure SASL
55 | if cfg.SASL.Enabled {
56 | // SASL Plain
57 | if cfg.SASL.Mechanism == "PLAIN" {
58 | mechanism := plain.Auth{
59 | User: cfg.SASL.Username,
60 | Pass: cfg.SASL.Password,
61 | }.AsMechanism()
62 | opts = append(opts, kgo.SASL(mechanism))
63 | }
64 |
65 | // SASL SCRAM
66 | if cfg.SASL.Mechanism == "SCRAM-SHA-256" || cfg.SASL.Mechanism == "SCRAM-SHA-512" {
67 | var mechanism sasl.Mechanism
68 | scramAuth := scram.Auth{
69 | User: cfg.SASL.Username,
70 | Pass: cfg.SASL.Password,
71 | }
72 | if cfg.SASL.Mechanism == "SCRAM-SHA-256" {
73 | mechanism = scramAuth.AsSha256Mechanism()
74 | }
75 | if cfg.SASL.Mechanism == "SCRAM-SHA-512" {
76 | mechanism = scramAuth.AsSha512Mechanism()
77 | }
78 | opts = append(opts, kgo.SASL(mechanism))
79 | }
80 |
81 | // Kerberos
82 | if cfg.SASL.Mechanism == "GSSAPI" {
83 | var krbClient *client.Client
84 |
85 | kerbCfg, err := krbconfig.Load(cfg.SASL.GSSAPI.KerberosConfigPath)
86 | if err != nil {
87 | return nil, fmt.Errorf("failed to create kerberos config from specified config filepath: %w", err)
88 | }
89 |
90 | switch cfg.SASL.GSSAPI.AuthType {
91 | case "USER_AUTH:":
92 | krbClient = client.NewWithPassword(
93 | cfg.SASL.GSSAPI.Username,
94 | cfg.SASL.GSSAPI.Realm,
95 | cfg.SASL.GSSAPI.Password,
96 | kerbCfg,
97 | client.DisablePAFXFAST(!cfg.SASL.GSSAPI.EnableFast))
98 | case "KEYTAB_AUTH":
99 | ktb, err := keytab.Load(cfg.SASL.GSSAPI.KeyTabPath)
100 | if err != nil {
101 | return nil, fmt.Errorf("failed to load keytab: %w", err)
102 | }
103 | krbClient = client.NewWithKeytab(
104 | cfg.SASL.GSSAPI.Username,
105 | cfg.SASL.GSSAPI.Realm,
106 | ktb,
107 | kerbCfg,
108 | client.DisablePAFXFAST(!cfg.SASL.GSSAPI.EnableFast))
109 | }
110 | kerberosMechanism := kerberos.Auth{
111 | Client: krbClient,
112 | Service: cfg.SASL.GSSAPI.ServiceName,
113 | PersistAfterAuth: true,
114 | }.AsMechanism()
115 | opts = append(opts, kgo.SASL(kerberosMechanism))
116 | }
117 |
118 | // OAuthBearer
119 | if cfg.SASL.Mechanism == "OAUTHBEARER" {
120 | mechanism := oauth.Oauth(func(ctx context.Context) (oauth.Auth, error) {
121 | token, err := cfg.SASL.OAuthBearer.getToken(ctx)
122 | return oauth.Auth{
123 | Zid: cfg.SASL.OAuthBearer.ClientID,
124 | Token: token,
125 | }, err
126 | })
127 | opts = append(opts, kgo.SASL(mechanism))
128 | }
129 | }
130 |
131 | // Configure TLS
132 | var caCertPool *x509.CertPool
133 | if cfg.TLS.Enabled {
134 | // Root CA
135 | if cfg.TLS.CaFilepath != "" || len(cfg.TLS.Ca) > 0 {
136 | ca := []byte(cfg.TLS.Ca)
137 | if cfg.TLS.CaFilepath != "" {
138 | caBytes, err := ioutil.ReadFile(cfg.TLS.CaFilepath)
139 | if err != nil {
140 | return nil, fmt.Errorf("failed to load ca cert: %w", err)
141 | }
142 | ca = caBytes
143 | }
144 | caCertPool = x509.NewCertPool()
145 | isSuccessful := caCertPool.AppendCertsFromPEM(ca)
146 | if !isSuccessful {
147 | logger.Warn("failed to append ca file to cert pool, is this a valid PEM format?")
148 | }
149 | }
150 |
151 | // If configured load TLS cert & key - Mutual TLS
152 | var certificates []tls.Certificate
153 | hasCertFile := cfg.TLS.CertFilepath != "" || len(cfg.TLS.Cert) > 0
154 | hasKeyFile := cfg.TLS.KeyFilepath != "" || len(cfg.TLS.Key) > 0
155 | if hasCertFile || hasKeyFile {
156 | cert := []byte(cfg.TLS.Cert)
157 | privateKey := []byte(cfg.TLS.Key)
158 | // 1. Read certificates
159 | if cfg.TLS.CertFilepath != "" {
160 | certBytes, err := ioutil.ReadFile(cfg.TLS.CertFilepath)
161 | if err != nil {
162 | return nil, fmt.Errorf("failed to TLS certificate: %w", err)
163 | }
164 | cert = certBytes
165 | }
166 |
167 | if cfg.TLS.KeyFilepath != "" {
168 | keyBytes, err := ioutil.ReadFile(cfg.TLS.KeyFilepath)
169 | if err != nil {
170 | return nil, fmt.Errorf("failed to read TLS key: %w", err)
171 | }
172 | privateKey = keyBytes
173 | }
174 |
175 | // 2. Check whether the private key is encrypted and decrypt it with the configured passphrase if so
176 | pemBlock, _ := pem.Decode(privateKey)
177 | if pemBlock == nil {
178 | return nil, fmt.Errorf("no valid private key found")
179 | }
180 |
181 | if x509.IsEncryptedPEMBlock(pemBlock) {
182 | decryptedKey, err := x509.DecryptPEMBlock(pemBlock, []byte(cfg.TLS.Passphrase))
183 | if err != nil {
184 | return nil, fmt.Errorf("private key is encrypted, but could not decrypt it: %s", err)
185 | }
186 | // If private key was encrypted we can overwrite the original contents now with the decrypted version
187 | privateKey = pem.EncodeToMemory(&pem.Block{Type: pemBlock.Type, Bytes: decryptedKey})
188 | }
189 | tlsCert, err := tls.X509KeyPair(cert, privateKey)
190 | if err != nil {
191 | return nil, fmt.Errorf("cannot parse pem: %s", err)
192 | }
193 | certificates = []tls.Certificate{tlsCert}
194 | }
195 |
196 | tlsDialer := &tls.Dialer{
197 | NetDialer: &net.Dialer{Timeout: 10 * time.Second},
198 | Config: &tls.Config{
199 | InsecureSkipVerify: cfg.TLS.InsecureSkipTLSVerify,
200 | Certificates: certificates,
201 | RootCAs: caCertPool,
202 | },
203 | }
204 | opts = append(opts, kgo.Dialer(tlsDialer.DialContext))
205 | }
206 |
207 | return opts, nil
208 | }
209 |
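For orientation, a minimal sketch of how the options returned by NewKgoConfig are typically consumed; kminion itself does this in Service.CreateAndTestClient further below:

package kafka

import (
	"fmt"

	"github.com/twmb/franz-go/pkg/kgo"
	"go.uber.org/zap"
)

// newClientFromConfigSketch combines the base options from NewKgoConfig with caller-specific
// options and hands them to franz-go.
func newClientFromConfigSketch(cfg Config, logger *zap.Logger, extra ...kgo.Opt) (*kgo.Client, error) {
	opts, err := NewKgoConfig(cfg, logger)
	if err != nil {
		return nil, fmt.Errorf("failed to build kgo options: %w", err)
	}
	return kgo.NewClient(append(opts, extra...)...)
}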
--------------------------------------------------------------------------------
/kafka/client_logger.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | import (
4 | "github.com/twmb/franz-go/pkg/kgo"
5 | "go.uber.org/zap"
6 | )
7 |
8 | type KgoZapLogger struct {
9 | logger *zap.SugaredLogger
10 | }
11 |
12 | // Level implements the kgo.Logger interface. It returns the log level to log at.
13 | // We pin this to debug as the zap logger decides what to actually send to the output stream.
14 | func (k KgoZapLogger) Level() kgo.LogLevel {
15 | return kgo.LogLevelDebug
16 | }
17 |
18 | // Log implements kgo.Logger interface
19 | func (k KgoZapLogger) Log(level kgo.LogLevel, msg string, keyvals ...interface{}) {
20 | switch level {
21 | case kgo.LogLevelDebug:
22 | k.logger.Debugw(msg, keyvals...)
23 | case kgo.LogLevelInfo:
24 | k.logger.Infow(msg, keyvals...)
25 | case kgo.LogLevelWarn:
26 | k.logger.Warnw(msg, keyvals...)
27 | case kgo.LogLevelError:
28 | k.logger.Errorw(msg, keyvals...)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/kafka/config.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | import "fmt"
4 |
5 | type Config struct {
6 | // General
7 | Brokers []string `koanf:"brokers"`
8 | ClientID string `koanf:"clientId"`
9 | RackID string `koanf:"rackId"`
10 |
11 | TLS TLSConfig `koanf:"tls"`
12 | SASL SASLConfig `koanf:"sasl"`
13 |
14 | RetryInitConnection bool `koanf:"retryInitConnection"`
15 | }
16 |
17 | func (c *Config) SetDefaults() {
18 | c.ClientID = "kminion"
19 |
20 | c.TLS.SetDefaults()
21 | c.SASL.SetDefaults()
22 | }
23 |
24 | func (c *Config) Validate() error {
25 | if len(c.Brokers) == 0 {
26 | return fmt.Errorf("no seed brokers specified, at least one must be configured")
27 | }
28 |
29 | err := c.TLS.Validate()
30 | if err != nil {
31 | return fmt.Errorf("failed to validate TLS config: %w", err)
32 | }
33 |
34 | err = c.SASL.Validate()
35 | if err != nil {
36 | return fmt.Errorf("failed to validate SASL config: %w", err)
37 | }
38 |
39 | return nil
40 | }
41 |
--------------------------------------------------------------------------------
/kafka/config_sasl.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | import "fmt"
4 |
5 | const (
6 | SASLMechanismPlain = "PLAIN"
7 | SASLMechanismScramSHA256 = "SCRAM-SHA-256"
8 | SASLMechanismScramSHA512 = "SCRAM-SHA-512"
9 | SASLMechanismGSSAPI = "GSSAPI"
10 | SASLMechanismOAuthBearer = "OAUTHBEARER"
11 | )
12 |
13 | // SASLConfig for Kafka Client
14 | type SASLConfig struct {
15 | Enabled bool `koanf:"enabled"`
16 | Username string `koanf:"username"`
17 | Password string `koanf:"password"`
18 | Mechanism string `koanf:"mechanism"`
19 |
20 | // SASL Mechanisms that require more configuration than username & password
21 | GSSAPI SASLGSSAPIConfig `koanf:"gssapi"`
22 | OAuthBearer OAuthBearerConfig `koanf:"oauth"`
23 | }
24 |
25 | // SetDefaults for SASL Config
26 | func (c *SASLConfig) SetDefaults() {
27 | c.Enabled = false
28 | c.Mechanism = SASLMechanismPlain
29 | c.GSSAPI.SetDefaults()
30 | }
31 |
32 | // Validate SASL config input
33 | func (c *SASLConfig) Validate() error {
34 | if !c.Enabled {
35 | return nil
36 | }
37 |
38 | switch c.Mechanism {
39 | case SASLMechanismPlain, SASLMechanismScramSHA256, SASLMechanismScramSHA512, SASLMechanismGSSAPI:
40 | // Valid and supported
41 | case SASLMechanismOAuthBearer:
42 | return c.OAuthBearer.Validate()
43 | default:
44 | return fmt.Errorf("given sasl mechanism '%v' is invalid", c.Mechanism)
45 | }
46 |
47 | return nil
48 | }
49 |
--------------------------------------------------------------------------------
/kafka/config_sasl_gssapi.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | // SASLGSSAPIConfig represents the Kafka Kerberos config
4 | type SASLGSSAPIConfig struct {
5 | AuthType string `koanf:"authType"`
6 | KeyTabPath string `koanf:"keyTabPath"`
7 | KerberosConfigPath string `koanf:"kerberosConfigPath"`
8 | ServiceName string `koanf:"serviceName"`
9 | Username string `koanf:"username"`
10 | Password string `koanf:"password"`
11 | Realm string `koanf:"realm"`
12 |
13 | // EnableFAST enables FAST, which is a pre-authentication framework for Kerberos.
14 | // It includes a mechanism for tunneling pre-authentication exchanges using armoured KDC messages.
15 | // FAST provides increased resistance to passive password guessing attacks.
16 | EnableFast bool `koanf:"enableFast"`
17 | }
18 |
19 | func (s *SASLGSSAPIConfig) SetDefaults() {
20 | s.EnableFast = true
21 | }
22 |
--------------------------------------------------------------------------------
/kafka/config_sasl_oauthbearer.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | import (
4 | "context"
5 | "encoding/base64"
6 | "encoding/json"
7 | "fmt"
8 | "net/http"
9 | "net/url"
10 | "strings"
11 | )
12 |
13 | type OAuthBearerConfig struct {
14 | TokenEndpoint string `koanf:"tokenEndpoint"`
15 | ClientID string `koanf:"clientId"`
16 | ClientSecret string `koanf:"clientSecret"`
17 | Scope string `koanf:"scope"`
18 | }
19 |
20 | func (c *OAuthBearerConfig) Validate() error {
21 | if c.TokenEndpoint == "" {
22 | return fmt.Errorf("OAuthBearer token endpoint is not specified")
23 | }
24 | if c.ClientID == "" || c.ClientSecret == "" {
25 | return fmt.Errorf("OAuthBearer client credentials are not specified")
26 | }
27 | return nil
28 | }
29 |
30 | // same as AcquireToken in Console https://github.com/redpanda-data/console/blob/master/backend/pkg/config/kafka_sasl_oauth.go#L56
31 | func (c *OAuthBearerConfig) getToken(ctx context.Context) (string, error) {
32 | authHeaderValue := base64.StdEncoding.EncodeToString([]byte(c.ClientID + ":" + c.ClientSecret))
33 |
34 | queryParams := url.Values{
35 | "grant_type": []string{"client_credentials"},
36 | "scope": []string{c.Scope},
37 | }
38 |
39 | req, err := http.NewRequestWithContext(ctx, "POST", c.TokenEndpoint, strings.NewReader(queryParams.Encode()))
40 | if err != nil {
41 | return "", fmt.Errorf("failed to create HTTP request: %w", err)
42 | }
43 |
44 | req.URL.RawQuery = queryParams.Encode()
45 |
46 | req.Header.Set("Authorization", "Basic "+authHeaderValue)
47 | req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
48 |
49 | client := &http.Client{}
50 |
51 | resp, err := client.Do(req)
52 | if err != nil {
53 | return "", fmt.Errorf("HTTP request failed: %w", err)
54 | }
55 | defer resp.Body.Close()
56 |
57 | if resp.StatusCode != http.StatusOK {
58 | return "", fmt.Errorf("token request failed with status code %d", resp.StatusCode)
59 | }
60 |
61 | var tokenResponse map[string]interface{}
62 | decoder := json.NewDecoder(resp.Body)
63 | if err := decoder.Decode(&tokenResponse); err != nil {
64 | return "", fmt.Errorf("failed to parse token response: %w", err)
65 | }
66 |
67 | accessToken, ok := tokenResponse["access_token"].(string)
68 | if !ok {
69 | return "", fmt.Errorf("access_token not found in token response")
70 | }
71 |
72 | return accessToken, nil
73 | }
74 |
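A sketch of how getToken could be exercised against a stub token endpoint; this is not part of kminion's test suite, and the JSON payload only needs the access_token field that the code above reads:

package kafka

import (
	"context"
	"net/http"
	"net/http/httptest"
	"testing"
)

func TestOAuthBearerGetTokenSketch(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"access_token":"dummy-token","token_type":"Bearer"}`))
	}))
	defer srv.Close()

	cfg := OAuthBearerConfig{TokenEndpoint: srv.URL, ClientID: "id", ClientSecret: "secret"}
	token, err := cfg.getToken(context.Background())
	if err != nil {
		t.Fatalf("getToken returned an error: %v", err)
	}
	if token != "dummy-token" {
		t.Fatalf("expected dummy-token, got %q", token)
	}
}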
--------------------------------------------------------------------------------
/kafka/config_tls.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | import "fmt"
4 |
5 | // TLSConfig to connect to Kafka via TLS
6 | type TLSConfig struct {
7 | Enabled bool `koanf:"enabled"`
8 | CaFilepath string `koanf:"caFilepath"`
9 | CertFilepath string `koanf:"certFilepath"`
10 | KeyFilepath string `koanf:"keyFilepath"`
11 | Ca string `koanf:"ca"`
12 | Cert string `koanf:"cert"`
13 | Key string `koanf:"key"`
14 | Passphrase string `koanf:"passphrase"`
15 | InsecureSkipTLSVerify bool `koanf:"insecureSkipTlsVerify"`
16 | }
17 |
18 | func (c *TLSConfig) SetDefaults() {
19 | c.Enabled = false
20 | }
21 |
22 | func (c *TLSConfig) Validate() error {
23 | if len(c.CaFilepath) > 0 && len(c.Ca) > 0 {
24 | return fmt.Errorf("config keys 'caFilepath' and 'ca' are both set. only one can be used at the same time")
25 | }
26 | if len(c.CertFilepath) > 0 && len(c.Cert) > 0 {
27 | return fmt.Errorf("config keys 'certFilepath' and 'cert' are both set. only one can be used at the same time")
28 | }
29 |
30 | if len(c.KeyFilepath) > 0 && len(c.Key) > 0 {
31 | return fmt.Errorf("config keys 'keyFilepath' and 'key' are both set. only one can be used at the same time")
32 | }
33 | return nil
34 | }
35 |
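Putting the kafka config pieces together, a hedged sketch of a Config for SCRAM-SHA-512 over TLS as NewKgoConfig would consume it; the broker address and credentials are placeholders, and the same effect is normally achieved via the corresponding tls/sasl keys in the YAML configuration:

package kafka

// exampleScramOverTLSConfigSketch builds a Config equivalent to enabling TLS and SCRAM-SHA-512 auth.
func exampleScramOverTLSConfigSketch() Config {
	cfg := Config{}
	cfg.SetDefaults()
	cfg.Brokers = []string{"broker-0.example.com:9093"}
	cfg.TLS.Enabled = true
	cfg.SASL.Enabled = true
	cfg.SASL.Mechanism = SASLMechanismScramSHA512
	cfg.SASL.Username = "kminion"
	cfg.SASL.Password = "change-me"
	return cfg
}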
--------------------------------------------------------------------------------
/kafka/service.go:
--------------------------------------------------------------------------------
1 | package kafka
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "time"
7 |
8 | "github.com/twmb/franz-go/pkg/kerr"
9 | "github.com/twmb/franz-go/pkg/kgo"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "github.com/twmb/franz-go/pkg/kversion"
12 | "go.uber.org/zap"
13 | )
14 |
15 | type Service struct {
16 | cfg Config
17 | logger *zap.Logger
18 | }
19 |
20 | func NewService(cfg Config, logger *zap.Logger) *Service {
21 | return &Service{
22 | cfg: cfg,
23 | logger: logger.Named("kafka_service"),
24 | }
25 | }
26 |
27 | // CreateAndTestClient creates a client with the service's default settings.
28 | // logger: will be used to log connections, errors, warnings about tls config, ...
29 | func (s *Service) CreateAndTestClient(ctx context.Context, l *zap.Logger, opts []kgo.Opt) (*kgo.Client, error) {
30 | logger := l.Named("kgo_client")
31 | // Config with default options
32 | kgoOpts, err := NewKgoConfig(s.cfg, logger)
33 | if err != nil {
34 | return nil, fmt.Errorf("failed to create a valid kafka Client config: %w", err)
35 | }
36 | // Append user (the service calling this method) provided options
37 | kgoOpts = append(kgoOpts, opts...)
38 |
39 | // Create kafka client
40 | client, err := kgo.NewClient(kgoOpts...)
41 | if err != nil {
42 | return nil, fmt.Errorf("failed to create kafka Client: %w", err)
43 | }
44 |
45 | // Test connection
46 | for {
47 | err = s.testConnection(client, ctx)
48 | if err == nil {
49 | break
50 | }
51 |
52 | if !s.cfg.RetryInitConnection {
53 | return nil, fmt.Errorf("failed to test connectivity to Kafka cluster %w", err)
54 | }
55 |
56 | logger.Warn("failed to test connectivity to Kafka cluster, retrying in 5 seconds", zap.Error(err))
57 | time.Sleep(time.Second * 5)
58 | }
59 |
60 | return client, nil
61 | }
62 |
63 | // Brokers returns list of brokers this service is connecting to
64 | func (s *Service) Brokers() []string {
65 | return s.cfg.Brokers
66 | }
67 |
68 | // testConnection tries to fetch Broker metadata and prints some information if connection succeeds. An error will be
69 | // returned if connecting fails.
70 | func (s *Service) testConnection(client *kgo.Client, ctx context.Context) error {
71 | connectCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
72 | defer cancel()
73 |
74 | req := kmsg.MetadataRequest{
75 | Topics: nil,
76 | }
77 | res, err := req.RequestWith(connectCtx, client)
78 | if err != nil {
79 | return fmt.Errorf("failed to request metadata: %w", err)
80 | }
81 |
82 | // Request versions in order to guess Kafka Cluster version
83 | versionsReq := kmsg.NewApiVersionsRequest()
84 | versionsRes, err := versionsReq.RequestWith(connectCtx, client)
85 | if err != nil {
86 | return fmt.Errorf("failed to request api versions: %w", err)
87 | }
88 | err = kerr.ErrorForCode(versionsRes.ErrorCode)
89 | if err != nil {
90 | return fmt.Errorf("failed to request api versions. Inner kafka error: %w", err)
91 | }
92 | versions := kversion.FromApiVersionsResponse(versionsRes)
93 |
94 | s.logger.Debug("successfully connected to kafka cluster",
95 | zap.Int("advertised_broker_count", len(res.Brokers)),
96 | zap.Int("topic_count", len(res.Topics)),
97 | zap.Int32("controller_id", res.ControllerID),
98 | zap.String("kafka_version", versions.VersionGuess()))
99 |
100 | return nil
101 | }
102 |
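A usage sketch for the service above; the topic name is a placeholder and the extra options depend on the caller:

package kafka

import (
	"context"

	"github.com/twmb/franz-go/pkg/kgo"
	"go.uber.org/zap"
)

// exampleCreateClientSketch shows how a caller (such as the minion or e2e service) can obtain a
// client: base options come from the service's config, callers append whatever they need.
func exampleCreateClientSketch(ctx context.Context, svc *Service, logger *zap.Logger) (*kgo.Client, error) {
	return svc.CreateAndTestClient(ctx, logger, []kgo.Opt{
		kgo.ConsumeTopics("some-topic"),
	})
}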
--------------------------------------------------------------------------------
/logging/config.go:
--------------------------------------------------------------------------------
1 | package logging
2 |
3 | import (
4 | "fmt"
5 | "go.uber.org/zap"
6 | )
7 |
8 | type Config struct {
9 | Level string `koanf:"level"`
10 | }
11 |
12 | func (c *Config) SetDefaults() {
13 | c.Level = "info"
14 | }
15 |
16 | func (c *Config) Validate() error {
17 | level := zap.NewAtomicLevel()
18 | err := level.UnmarshalText([]byte(c.Level))
19 | if err != nil {
20 | return fmt.Errorf("failed to parse logger level: %w", err)
21 | }
22 |
23 | return nil
24 | }
25 |
--------------------------------------------------------------------------------
/logging/logger.go:
--------------------------------------------------------------------------------
1 | package logging
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/prometheus/client_golang/prometheus"
7 | "github.com/prometheus/client_golang/prometheus/promauto"
8 | "go.uber.org/zap/zapcore"
9 |
10 | "go.uber.org/zap"
11 | )
12 |
13 | // NewLogger creates a preconfigured zap logger and also registers it as the global zap logger.
14 | func NewLogger(cfg Config, metricsNamespace string) *zap.Logger {
15 | encoderCfg := zap.NewProductionEncoderConfig()
16 | encoderCfg.EncodeTime = zapcore.ISO8601TimeEncoder
17 |
18 | // Parse log level text to zap.LogLevel. Error check isn't required because the input is already validated.
19 | level := zap.NewAtomicLevel()
20 | _ = level.UnmarshalText([]byte(cfg.Level))
21 |
22 | core := zapcore.NewCore(
23 | zapcore.NewJSONEncoder(encoderCfg),
24 | zapcore.Lock(os.Stdout),
25 | level,
26 | )
27 | core = zapcore.RegisterHooks(core, prometheusHook(metricsNamespace))
28 | logger := zap.New(core)
29 | zap.ReplaceGlobals(logger)
30 |
31 | return logger
32 | }
33 |
34 | // prometheusHook is a hook for the zap library which exposes Prometheus counters for various log levels.
35 | func prometheusHook(metricsNamespace string) func(zapcore.Entry) error {
36 | messageCounterVec := promauto.NewCounterVec(prometheus.CounterOpts{
37 | Namespace: metricsNamespace,
38 | Name: "log_messages_total",
39 | Help: "Total number of log messages by log level emitted by KMinion.",
40 | }, []string{"level"})
41 |
42 | // Initialize counters for all supported log levels so that they expose 0 for each level on startup
43 | supportedLevels := []zapcore.Level{
44 | zapcore.DebugLevel,
45 | zapcore.InfoLevel,
46 | zapcore.WarnLevel,
47 | zapcore.ErrorLevel,
48 | zapcore.FatalLevel,
49 | zapcore.PanicLevel,
50 | }
51 | for _, level := range supportedLevels {
52 | messageCounterVec.WithLabelValues(level.String())
53 | }
54 |
55 | return func(entry zapcore.Entry) error {
56 | messageCounterVec.WithLabelValues(entry.Level.String()).Inc()
57 | return nil
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "net"
8 | "net/http"
9 | "os"
10 | "os/signal"
11 | "strconv"
12 |
13 | "github.com/cloudhut/kminion/v2/e2e"
14 | "github.com/cloudhut/kminion/v2/kafka"
15 | "github.com/cloudhut/kminion/v2/logging"
16 | "github.com/cloudhut/kminion/v2/minion"
17 | "github.com/cloudhut/kminion/v2/prometheus"
18 | promclient "github.com/prometheus/client_golang/prometheus"
19 | "github.com/prometheus/client_golang/prometheus/promhttp"
20 | "go.uber.org/zap"
21 | )
22 |
23 | var (
24 | // ------------------------------------------------------------------------
25 | // Below parameters are set at build time using ldflags.
26 | // ------------------------------------------------------------------------
27 |
28 | // version is KMinion's SemVer version (for example: v1.0.0).
29 | version = "development"
31 | // builtAt is a human-readable string of the date at which the binary was built.
31 | builtAt = "N/A"
32 | // commit is a string that represents the last git commit for this build.
33 | commit = "N/A"
34 | )
35 |
36 | func main() {
37 | startupLogger, err := zap.NewProduction()
38 | if err != nil {
39 | panic(fmt.Errorf("failed to create startup logger: %w", err))
40 | }
41 |
42 | cfg, err := newConfig(startupLogger)
43 | if err != nil {
44 | startupLogger.Fatal("failed to parse config", zap.Error(err))
45 | }
46 |
47 | logger := logging.NewLogger(cfg.Logger, cfg.Exporter.Namespace).Named("main")
48 | // Note: NewLogger does not return an error. The configured log level has already been validated as part
49 | // of the config validation above, so there is nothing to check at this point.
50 |
51 |
52 | logger.Info("started kminion", zap.String("version", version), zap.String("built_at", builtAt))
53 |
54 | // Setup context that stops when the application receives an interrupt signal
55 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
56 | defer stop()
57 |
58 | wrappedRegisterer := promclient.WrapRegistererWithPrefix(cfg.Exporter.Namespace+"_", promclient.DefaultRegisterer)
59 |
60 | // Create kafka service
61 | kafkaSvc := kafka.NewService(cfg.Kafka, logger)
62 |
63 | // Create minion service
64 | // Prometheus exporter only talks to the minion service which
65 | // issues all the requests to Kafka and wraps the interface accordingly.
66 | minionSvc, err := minion.NewService(cfg.Minion, logger, kafkaSvc, cfg.Exporter.Namespace, ctx)
67 | if err != nil {
68 | logger.Fatal("failed to setup minion service", zap.Error(err))
69 | }
70 |
71 | err = minionSvc.Start(ctx)
72 | if err != nil {
73 | logger.Fatal("failed to start minion service", zap.Error(err))
74 | }
75 |
76 | // Create end to end testing service
77 | if cfg.Minion.EndToEnd.Enabled {
78 | e2eService, err := e2e.NewService(
79 | ctx,
80 | cfg.Minion.EndToEnd,
81 | logger,
82 | kafkaSvc,
83 | wrappedRegisterer,
84 | )
85 | if err != nil {
86 | logger.Fatal("failed to create end-to-end monitoring service: %w", zap.Error(err))
87 | }
88 |
89 | if err = e2eService.Start(ctx); err != nil {
90 | logger.Fatal("failed to start end-to-end monitoring service", zap.Error(err))
91 | }
92 | }
93 |
94 | // The Prometheus exporter that implements the Prometheus collector interface
95 | exporter, err := prometheus.NewExporter(cfg.Exporter, logger, minionSvc)
96 | if err != nil {
97 | logger.Fatal("failed to setup prometheus exporter", zap.Error(err))
98 | }
99 | exporter.InitializeMetrics()
100 |
101 | promclient.MustRegister(exporter)
102 | http.Handle("/metrics",
103 | promhttp.InstrumentMetricHandler(
104 | promclient.DefaultRegisterer,
105 | promhttp.HandlerFor(
106 | promclient.DefaultGatherer,
107 | promhttp.HandlerOpts{},
108 | ),
109 | ),
110 | )
111 | http.Handle("/ready", minionSvc.HandleIsReady())
112 |
113 | // Start HTTP server
114 | address := net.JoinHostPort(cfg.Exporter.Host, strconv.Itoa(cfg.Exporter.Port))
115 | srv := &http.Server{Addr: address}
116 | go func() {
117 | <-ctx.Done()
118 | if err := srv.Shutdown(context.Background()); err != nil {
119 | logger.Error("error stopping HTTP server", zap.Error(err))
120 | os.Exit(1)
121 | }
122 | }()
123 | logger.Info("listening on address", zap.String("listen_address", address))
124 | if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
125 | logger.Error("error starting HTTP server", zap.Error(err))
126 | os.Exit(1)
127 | }
128 |
129 | logger.Info("kminion stopped")
130 | }
131 |
--------------------------------------------------------------------------------
/minion/client_hooks.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "net"
5 | "time"
6 |
7 | "github.com/prometheus/client_golang/prometheus"
8 | "github.com/prometheus/client_golang/prometheus/promauto"
9 | "github.com/twmb/franz-go/pkg/kgo"
10 | "go.uber.org/zap"
11 | )
12 |
13 | // clientHooks implements the various hook interfaces from the franz-go (kafka) library. We can use these hooks to
14 | // log additional information, collect Prometheus metrics and similar.
15 | type clientHooks struct {
16 | logger *zap.Logger
17 |
18 | requestSentCount prometheus.Counter
19 | bytesSent prometheus.Counter
20 |
21 | requestsReceivedCount prometheus.Counter
22 | bytesReceived prometheus.Counter
23 | }
24 |
25 | func newMinionClientHooks(logger *zap.Logger, metricsNamespace string) *clientHooks {
26 | requestSentCount := promauto.NewCounter(prometheus.CounterOpts{
27 | Namespace: metricsNamespace,
28 | Subsystem: "kafka",
29 | Name: "requests_sent_total"})
30 | bytesSent := promauto.NewCounter(prometheus.CounterOpts{
31 | Namespace: metricsNamespace,
32 | Subsystem: "kafka",
33 | Name: "sent_bytes",
34 | })
35 |
36 | requestsReceivedCount := promauto.NewCounter(prometheus.CounterOpts{
37 | Namespace: metricsNamespace,
38 | Subsystem: "kafka",
39 | Name: "requests_received_total"})
40 | bytesReceived := promauto.NewCounter(prometheus.CounterOpts{
41 | Namespace: metricsNamespace,
42 | Subsystem: "kafka",
43 | Name: "received_bytes",
44 | })
45 |
46 | return &clientHooks{
47 | logger: logger,
48 |
49 | requestSentCount: requestSentCount,
50 | bytesSent: bytesSent,
51 |
52 | requestsReceivedCount: requestsReceivedCount,
53 | bytesReceived: bytesReceived,
54 | }
55 | }
56 |
57 | func (c clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur time.Duration, _ net.Conn, err error) {
58 | if err != nil {
59 | c.logger.Debug("kafka connection failed", zap.String("broker_host", meta.Host), zap.Error(err))
60 | return
61 | }
62 | c.logger.Debug("kafka connection succeeded",
63 | zap.String("host", meta.Host),
64 | zap.Duration("dial_duration", dialDur))
65 | }
66 |
67 | func (c clientHooks) OnBrokerDisconnect(meta kgo.BrokerMetadata, _ net.Conn) {
68 | c.logger.Debug("kafka broker disconnected",
69 | zap.String("host", meta.Host))
70 | }
71 |
72 | // OnBrokerRead is passed the broker metadata, the key for the response that
73 | // was read, the number of bytes read, how long the Client waited
74 | // before reading the response, how long it took to read the response,
75 | // and any error.
76 | //
77 | // The bytes read do not count any TLS overhead.
78 | // OnRead is called after a read from a broker.
79 | func (c clientHooks) OnBrokerRead(_ kgo.BrokerMetadata, _ int16, bytesRead int, _, _ time.Duration, _ error) {
80 | c.requestsReceivedCount.Inc()
81 | c.bytesReceived.Add(float64(bytesRead))
82 | }
83 |
84 | // OnBrokerWrite is passed the broker metadata, the key for the request that
85 | // was written, the number of bytes written, how long the request
86 | // waited before being written, how long it took to write the request,
87 | // and any error.
88 | //
89 | // The bytes written do not count any TLS overhead.
90 | // OnWrite is called after a write to a broker.
91 | func (c clientHooks) OnBrokerWrite(_ kgo.BrokerMetadata, _ int16, bytesWritten int, _, _ time.Duration, _ error) {
92 | c.requestSentCount.Inc()
93 | c.bytesSent.Add(float64(bytesWritten))
94 | }
95 |
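A sketch of how these hooks can be attached to a franz-go client; kminion does the equivalent when the minion service assembles its client options (not shown in this section):

package minion

import (
	"github.com/twmb/franz-go/pkg/kgo"
	"go.uber.org/zap"
)

// exampleHookOptSketch wires the hooks above into a client via kgo.WithHooks, so every broker
// read/write updates the request and byte counters.
func exampleHookOptSketch(logger *zap.Logger, metricsNamespace string) kgo.Opt {
	return kgo.WithHooks(newMinionClientHooks(logger.Named("client_hooks"), metricsNamespace))
}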
--------------------------------------------------------------------------------
/minion/config.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/cloudhut/kminion/v2/e2e"
7 | )
8 |
9 | type Config struct {
10 | ConsumerGroups ConsumerGroupConfig `koanf:"consumerGroups"`
11 | Topics TopicConfig `koanf:"topics"`
12 | LogDirs LogDirsConfig `koanf:"logDirs"`
13 | EndToEnd e2e.Config `koanf:"endToEnd"`
14 | }
15 |
16 | func (c *Config) SetDefaults() {
17 | c.ConsumerGroups.SetDefaults()
18 | c.Topics.SetDefaults()
19 | c.LogDirs.SetDefaults()
20 | c.EndToEnd.SetDefaults()
21 | }
22 |
23 | func (c *Config) Validate() error {
24 | err := c.ConsumerGroups.Validate()
25 | if err != nil {
26 | return fmt.Errorf("failed to consumer group config: %w", err)
27 | }
28 |
29 | err = c.Topics.Validate()
30 | if err != nil {
31 | return fmt.Errorf("failed to validate topic config: %w", err)
32 | }
33 |
34 | err = c.LogDirs.Validate()
35 | if err != nil {
36 | return fmt.Errorf("failed to validate log dirs config: %w", err)
37 | }
38 |
39 | err = c.EndToEnd.Validate()
40 | if err != nil {
41 | return fmt.Errorf("failed to validate endToEnd config: %w", err)
42 | }
43 |
44 | return nil
45 | }
46 |
--------------------------------------------------------------------------------
/minion/config_consumer_group.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "fmt"
5 | )
6 |
7 | const (
8 | ConsumerGroupScrapeModeOffsetsTopic string = "offsetsTopic"
9 | ConsumerGroupScrapeModeAdminAPI string = "adminApi"
10 |
11 | ConsumerGroupGranularityTopic string = "topic"
12 | ConsumerGroupGranularityPartition string = "partition"
13 | )
14 |
15 | type ConsumerGroupConfig struct {
16 | // Enabled specifies whether consumer groups shall be scraped and exported or not.
17 | Enabled bool `koanf:"enabled"`
18 |
19 | // Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal
20 | // __consumer_offsets topic.
21 | ScrapeMode string `koanf:"scrapeMode"`
22 |
23 | // Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
24 | // you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed
25 | // and only topic lags will be exported.
26 | Granularity string `koanf:"granularity"`
27 |
28 | // AllowedGroups are regex strings of group ids that shall be exported
29 | AllowedGroupIDs []string `koanf:"allowedGroups"`
30 |
31 | // IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups
32 | // take precedence over allowed groups.
33 | IgnoredGroupIDs []string `koanf:"ignoredGroups"`
34 | }
35 |
36 | func (c *ConsumerGroupConfig) SetDefaults() {
37 | c.Enabled = true
38 | c.ScrapeMode = ConsumerGroupScrapeModeAdminAPI
39 | c.Granularity = ConsumerGroupGranularityPartition
40 | c.AllowedGroupIDs = []string{"/.*/"}
41 | }
42 |
43 | func (c *ConsumerGroupConfig) Validate() error {
44 | switch c.ScrapeMode {
45 | case ConsumerGroupScrapeModeOffsetsTopic, ConsumerGroupScrapeModeAdminAPI:
46 | default:
47 | return fmt.Errorf("invalid scrape mode '%v' specified. Valid modes are '%v' or '%v'",
48 | c.ScrapeMode,
49 | ConsumerGroupScrapeModeOffsetsTopic,
50 | ConsumerGroupScrapeModeAdminAPI)
51 | }
52 |
53 | switch c.Granularity {
54 | case ConsumerGroupGranularityTopic, ConsumerGroupGranularityPartition:
55 | default:
56 | return fmt.Errorf("invalid consumer group granularity '%v' specified. Valid modes are '%v' or '%v'",
57 | c.Granularity,
58 | ConsumerGroupGranularityTopic,
59 | ConsumerGroupGranularityPartition)
60 | }
61 |
62 | // Check if all group strings are valid regex or literals
63 | for _, groupID := range c.AllowedGroupIDs {
64 | _, err := compileRegex(groupID)
65 | if err != nil {
66 | return fmt.Errorf("allowed group string '%v' is not valid regex", groupID)
67 | }
68 | }
69 |
70 | for _, groupID := range c.IgnoredGroupIDs {
71 | _, err := compileRegex(groupID)
72 | if err != nil {
73 | return fmt.Errorf("ignored group string '%v' is not valid regex", groupID)
74 | }
75 | }
76 |
77 | return nil
78 | }
79 |
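For illustration, a Config value that narrows the exported groups, assuming the same slash-delimited regex notation as the defaults above; the group name patterns are placeholders:

package minion

// exampleConsumerGroupConfigSketch exports only groups starting with "payments-" while skipping
// short-lived console groups. Ignored groups take precedence over allowed ones.
func exampleConsumerGroupConfigSketch() ConsumerGroupConfig {
	cfg := ConsumerGroupConfig{}
	cfg.SetDefaults()
	cfg.AllowedGroupIDs = []string{"/payments-.*/"}
	cfg.IgnoredGroupIDs = []string{"/console-.*/"}
	return cfg
}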
--------------------------------------------------------------------------------
/minion/config_log_dirs.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | type LogDirsConfig struct {
4 | // Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for
5 | // Kafka clusters prior to version 1.0.0 as describing log dirs was not supported back then.
6 | Enabled bool `koanf:"enabled"`
7 | }
8 |
9 | // Validate if provided LogDirsConfig is valid.
10 | func (c *LogDirsConfig) Validate() error {
11 | return nil
12 | }
13 |
14 | // SetDefaults for log dirs config
15 | func (c *LogDirsConfig) SetDefaults() {
16 | c.Enabled = true
17 | }
18 |
--------------------------------------------------------------------------------
/minion/config_topic_config.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "fmt"
5 | )
6 |
7 | const (
8 | TopicGranularityTopic string = "topic"
9 | TopicGranularityPartition string = "partition"
10 | )
11 |
12 | type TopicConfig struct {
13 | // Enabled can be set to false in order to not collect any topic metrics at all.
14 | Enabled bool `koanf:"enabled"`
15 |
16 | // Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
17 | // you aren't interested in per partition metrics you could choose "topic".
18 | Granularity string `koanf:"granularity"`
19 |
20 | // AllowedTopics are regex strings of topic names whose topic metrics that shall be exported.
21 | AllowedTopics []string `koanf:"allowedTopics"`
22 |
23 | // IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics
24 | // take precedence over allowed topics.
25 | IgnoredTopics []string `koanf:"ignoredTopics"`
26 |
27 | // InfoMetric configures how the kafka_topic_info metric is populated
28 | InfoMetric InfoMetricConfig `koanf:"infoMetric"`
29 | }
30 |
31 | type InfoMetricConfig struct {
32 | // ConfigKeys configures optional topic configuration keys that should be exported
33 | // as prometheus metric labels.
34 | // By default only "cleanup.policy" is exported
35 | ConfigKeys []string `koanf:"configKeys"`
36 | }
37 |
38 | // Validate if provided TopicConfig is valid.
39 | func (c *TopicConfig) Validate() error {
40 | switch c.Granularity {
41 | case TopicGranularityPartition, TopicGranularityTopic:
42 | default:
43 | return fmt.Errorf("given granularity '%v' is invalid", c.Granularity)
44 | }
45 |
46 | // Check whether each provided string is valid regex
47 | for _, topic := range c.AllowedTopics {
48 | _, err := compileRegex(topic)
49 | if err != nil {
50 | return fmt.Errorf("allowed topic string '%v' is not valid regex", topic)
51 | }
52 | }
53 |
54 | for _, topic := range c.IgnoredTopics {
55 | _, err := compileRegex(topic)
56 | if err != nil {
57 | return fmt.Errorf("ignored topic string '%v' is not valid regex", topic)
58 | }
59 | }
60 |
61 | return nil
62 | }
63 |
64 | // SetDefaults for topic config
65 | func (c *TopicConfig) SetDefaults() {
66 | c.Enabled = true
67 | c.Granularity = TopicGranularityPartition
68 | c.AllowedTopics = []string{"/.*/"}
69 | c.InfoMetric = InfoMetricConfig{ConfigKeys: []string{"cleanup.policy"}}
70 | }
71 |
--------------------------------------------------------------------------------
/minion/consumer_group_offsets.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "sync"
7 |
8 | "github.com/twmb/franz-go/pkg/kmsg"
9 | "go.uber.org/zap"
10 | "golang.org/x/sync/errgroup"
11 | )
12 |
13 | // ListAllConsumerGroupOffsetsInternal returns a map from the in memory storage. The map value is the offset commit
14 | // value and is grouped by group id, topic, partition id as keys of the nested maps.
15 | func (s *Service) ListAllConsumerGroupOffsetsInternal() map[string]map[string]map[int32]OffsetCommit {
16 | return s.storage.getGroupOffsets()
17 | }
18 |
19 | // ListAllConsumerGroupOffsetsAdminAPI return all consumer group offsets using Kafka's Admin API.
20 | func (s *Service) ListAllConsumerGroupOffsetsAdminAPI(ctx context.Context) (map[string]*kmsg.OffsetFetchResponse, error) {
21 | groupsRes, err := s.listConsumerGroupsCached(ctx)
22 | if err != nil {
23 | return nil, fmt.Errorf("failed to list groupsRes: %w", err)
24 | }
25 | groupIDs := make([]string, len(groupsRes.Groups))
26 | for i, group := range groupsRes.Groups {
27 | groupIDs[i] = group.Group
28 | }
29 |
30 | return s.listConsumerGroupOffsetsBulk(ctx, groupIDs)
31 | }
32 |
33 | // listConsumerGroupOffsetsBulk returns a map which has the Consumer group name as key
34 | func (s *Service) listConsumerGroupOffsetsBulk(ctx context.Context, groups []string) (map[string]*kmsg.OffsetFetchResponse, error) {
35 | eg, _ := errgroup.WithContext(ctx)
36 |
37 | mutex := sync.Mutex{}
38 | res := make(map[string]*kmsg.OffsetFetchResponse)
39 |
40 | f := func(group string) func() error {
41 | return func() error {
42 | offsets, err := s.listConsumerGroupOffsets(ctx, group)
43 | if err != nil {
44 | s.logger.Warn("failed to fetch consumer group offsets, inner kafka error",
45 | zap.String("consumer_group", group),
46 | zap.Error(err))
47 | return nil
48 | }
49 |
50 | mutex.Lock()
51 | res[group] = offsets
52 | mutex.Unlock()
53 | return nil
54 | }
55 | }
56 |
57 | for _, group := range groups {
58 | eg.Go(f(group))
59 | }
60 |
61 | if err := eg.Wait(); err != nil {
62 | return nil, err
63 | }
64 |
65 | return res, nil
66 | }
67 |
68 | // listConsumerGroupOffsets returns the committed group offsets for a single group
69 | func (s *Service) listConsumerGroupOffsets(ctx context.Context, group string) (*kmsg.OffsetFetchResponse, error) {
70 | req := kmsg.NewOffsetFetchRequest()
71 | req.Group = group
72 | req.Topics = nil
73 | res, err := req.RequestWith(ctx, s.client)
74 | if err != nil {
75 | return nil, fmt.Errorf("failed to request group offsets for group '%v': %w", group, err)
76 | }
77 |
78 | return res, nil
79 | }
80 |
--------------------------------------------------------------------------------
/minion/describe_consumer_groups.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "time"
7 |
8 | "github.com/twmb/franz-go/pkg/kerr"
9 | "github.com/twmb/franz-go/pkg/kgo"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "go.uber.org/zap"
12 | )
13 |
14 | type DescribeConsumerGroupsResponse struct {
15 | BrokerMetadata kgo.BrokerMetadata
16 | Groups *kmsg.DescribeGroupsResponse
17 | }
18 |
19 | func (s *Service) listConsumerGroupsCached(ctx context.Context) (*kmsg.ListGroupsResponse, error) {
20 | reqId := ctx.Value("requestId").(string)
21 | key := "list-consumer-groups-" + reqId
22 |
23 | if cachedRes, exists := s.getCachedItem(key); exists {
24 | return cachedRes.(*kmsg.ListGroupsResponse), nil
25 | }
26 | res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) {
27 | res, err := s.listConsumerGroups(ctx)
28 | if err != nil {
29 | return nil, err
30 | }
31 | s.setCachedItem(key, res, 120*time.Second)
32 |
33 | return res, nil
34 | })
35 | if err != nil {
36 | return nil, err
37 | }
38 |
39 | return res.(*kmsg.ListGroupsResponse), nil
40 | }
41 |
42 | func (s *Service) listConsumerGroups(ctx context.Context) (*kmsg.ListGroupsResponse, error) {
43 | listReq := kmsg.NewListGroupsRequest()
44 | res, err := listReq.RequestWith(ctx, s.client)
45 | if err != nil {
46 | return nil, fmt.Errorf("failed to list consumer groups: %w", err)
47 | }
48 | err = kerr.ErrorForCode(res.ErrorCode)
49 | if err != nil {
50 | return nil, fmt.Errorf("failed to list consumer groups. inner kafka error: %w", err)
51 | }
52 |
53 | return res, nil
54 | }
55 |
56 | func (s *Service) DescribeConsumerGroups(ctx context.Context) ([]DescribeConsumerGroupsResponse, error) {
57 | listRes, err := s.listConsumerGroupsCached(ctx)
58 | if err != nil {
59 | return nil, err
60 | }
61 |
62 | groupIDs := make([]string, len(listRes.Groups))
63 | for i, group := range listRes.Groups {
64 | groupIDs[i] = group.Group
65 | }
66 |
67 | describeReq := kmsg.NewDescribeGroupsRequest()
68 | describeReq.Groups = groupIDs
69 | describeReq.IncludeAuthorizedOperations = false
70 | shardedResp := s.client.RequestSharded(ctx, &describeReq)
71 |
72 | describedGroups := make([]DescribeConsumerGroupsResponse, 0)
73 | for _, kresp := range shardedResp {
74 | if kresp.Err != nil {
75 | s.logger.Warn("broker failed to respond to the described groups request",
76 | zap.Int32("broker_id", kresp.Meta.NodeID),
77 | zap.Error(kresp.Err))
78 | continue
79 | }
80 | res := kresp.Resp.(*kmsg.DescribeGroupsResponse)
81 |
82 | describedGroups = append(describedGroups, DescribeConsumerGroupsResponse{
83 | BrokerMetadata: kresp.Meta,
84 | Groups: res,
85 | })
86 | }
87 |
88 | return describedGroups, nil
89 | }
90 |
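
listConsumerGroupsCached combines a per-scrape cache with golang.org/x/sync/singleflight so that concurrent collectors issue at most one ListGroups request per scrape. The following is a simplified, self-contained sketch of that combination; the TTL and key naming are illustrative, not KMinion's exact wiring.

package main

import (
	"fmt"
	"sync"
	"time"

	"golang.org/x/sync/singleflight"
)

// cached deduplicates concurrent lookups for the same key via singleflight and memoizes the
// result for a fixed TTL, mirroring the listConsumerGroupsCached/GetMetadataCached pattern
// above in simplified form.
type cached struct {
	group singleflight.Group
	mu    sync.RWMutex
	items map[string]interface{}
}

func (c *cached) get(key string, load func() (interface{}, error)) (interface{}, error) {
	c.mu.RLock()
	if v, ok := c.items[key]; ok {
		c.mu.RUnlock()
		return v, nil
	}
	c.mu.RUnlock()

	v, err, _ := c.group.Do(key, func() (interface{}, error) {
		v, err := load()
		if err != nil {
			return nil, err
		}
		c.mu.Lock()
		c.items[key] = v
		c.mu.Unlock()
		// Expire the entry after the same 120s window used above.
		go func() {
			time.Sleep(120 * time.Second)
			c.mu.Lock()
			delete(c.items, key)
			c.mu.Unlock()
		}()
		return v, nil
	})
	return v, err
}

func main() {
	c := &cached{items: map[string]interface{}{}}
	v, _ := c.get("list-consumer-groups-req-1", func() (interface{}, error) {
		return "list groups response", nil
	})
	fmt.Println(v)
}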
--------------------------------------------------------------------------------
/minion/describe_topic_config.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "fmt"
6 |
7 | "github.com/pkg/errors"
8 | "github.com/twmb/franz-go/pkg/kmsg"
9 | )
10 |
11 | func (s *Service) GetTopicConfigs(ctx context.Context) (*kmsg.DescribeConfigsResponse, error) {
12 | metadata, err := s.GetMetadataCached(ctx)
13 | if err != nil {
14 | return nil, errors.Wrap(err, "failed to get metadata")
15 | }
16 |
17 | req := kmsg.NewDescribeConfigsRequest()
18 |
19 | for _, topic := range metadata.Topics {
20 | resourceReq := kmsg.NewDescribeConfigsRequestResource()
21 | resourceReq.ResourceType = kmsg.ConfigResourceTypeTopic
22 | resourceReq.ResourceName = *topic.Topic
23 | req.Resources = append(req.Resources, resourceReq)
24 | }
25 |
26 | res, err := req.RequestWith(ctx, s.client)
27 | if err != nil {
28 | return nil, fmt.Errorf("failed to request topic configs: %w", err)
29 | }
30 |
31 | return res, nil
32 | }
33 |
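
The collectors later in this dump (collect_topic_info.go) index this response by topic and config name. A compact sketch of that flattening step, assuming it lives in the minion package for the kmsg types; per-resource error codes are skipped here and are checked by the real collector.

package minion

import "github.com/twmb/franz-go/pkg/kmsg"

// configsByTopic flattens a DescribeConfigsResponse into topic -> config name -> value.
// Per-resource error codes are ignored in this sketch; the Prometheus collector checks them.
func configsByTopic(res *kmsg.DescribeConfigsResponse) map[string]map[string]string {
	out := make(map[string]map[string]string, len(res.Resources))
	for _, resource := range res.Resources {
		entries := make(map[string]string, len(resource.Configs))
		for _, cfg := range resource.Configs {
			val := ""
			if cfg.Value != nil {
				val = *cfg.Value
			}
			entries[cfg.Name] = val
		}
		out[resource.ResourceName] = entries
	}
	return out
}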
--------------------------------------------------------------------------------
/minion/list_offsets.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "strconv"
8 | "time"
9 |
10 | "github.com/twmb/franz-go/pkg/kadm"
11 | "go.uber.org/zap"
12 | )
13 |
14 | func (s *Service) ListOffsetsCached(ctx context.Context, timestamp int64) (kadm.ListedOffsets, error) {
15 | reqId := ctx.Value("requestId").(string)
16 | key := "partition-offsets-" + strconv.Itoa(int(timestamp)) + "-" + reqId
17 |
18 | if cachedRes, exists := s.getCachedItem(key); exists {
19 | return cachedRes.(kadm.ListedOffsets), nil
20 | }
21 |
22 | res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) {
23 | offsets, err := s.ListOffsets(ctx, timestamp)
24 | if err != nil {
25 | return nil, err
26 | }
27 |
28 | s.setCachedItem(key, offsets, 120*time.Second)
29 |
30 | return offsets, nil
31 | })
32 | if err != nil {
33 | return nil, err
34 | }
35 |
36 | return res.(kadm.ListedOffsets), nil
37 | }
38 |
39 | // ListOffsets fetches the low (timestamp: -2) or high water mark (timestamp: -1) for all topic partitions
40 | func (s *Service) ListOffsets(ctx context.Context, timestamp int64) (kadm.ListedOffsets, error) {
41 | listedOffsets, err := s.admClient.ListEndOffsets(ctx)
42 | if err != nil {
43 | var se *kadm.ShardErrors
44 | if !errors.As(err, &se) {
45 | return nil, fmt.Errorf("failed to list offsets: %w", err)
46 | }
47 |
48 | if se.AllFailed {
49 | return nil, fmt.Errorf("failed to list offsets, all shard responses failed: %w", err)
50 | }
51 | s.logger.Info("failed to list offset from some shards", zap.Int("failed_shards", len(se.Errs)))
52 | for _, shardErr := range se.Errs {
53 | s.logger.Warn("shard error for listing end offsets",
54 | zap.Int32("broker_id", shardErr.Broker.NodeID),
55 | zap.Error(shardErr.Err))
56 | }
57 | }
58 |
59 | // Log inner errors before returning them. We do this inside this function to avoid duplicate logging, as the
60 | // responses are cached for each scrape anyway.
61 | //
62 | // Aggregate the errors into two counters so that we log a few summary messages instead of one message per
63 | // failed partition. Typical errors are LEADER_NOT_AVAILABLE etc.
64 | errorCountByErrCode := make(map[error]int)
65 | errorCountByTopic := make(map[string]int)
66 |
67 | // Iterate on all partitions
68 | listedOffsets.Each(func(offset kadm.ListedOffset) {
69 | if offset.Err != nil {
70 | errorCountByTopic[offset.Topic]++
71 | errorCountByErrCode[offset.Err]++
72 | }
73 | })
74 |
75 | // Print log line for each error type
76 | for err, count := range errorCountByErrCode {
77 | s.logger.Warn("failed to list some partitions watermarks",
78 | zap.Error(err),
79 | zap.Int("error_count", count))
80 | }
81 | if len(errorCountByTopic) > 0 {
82 | s.logger.Warn("some topics had one or more partitions whose watermarks could not be fetched from Kafka",
83 | zap.Int("topics_with_errors", len(errorCountByTopic)))
84 | }
85 |
86 | return listedOffsets, nil
87 | }
88 |
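
The error aggregation above can be expressed as a small helper over kadm.ListedOffsets; a condensed sketch, assuming it lives in this package.

package minion

import "github.com/twmb/franz-go/pkg/kadm"

// partitionErrorsByTopic counts per-partition errors in a ListOffsets result so that a single
// summary line can be logged per topic, as done above.
func partitionErrorsByTopic(offsets kadm.ListedOffsets) map[string]int {
	counts := make(map[string]int)
	offsets.Each(func(o kadm.ListedOffset) {
		if o.Err != nil {
			counts[o.Topic]++
		}
	})
	return counts
}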
--------------------------------------------------------------------------------
/minion/log_dirs.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 |
6 | "github.com/twmb/franz-go/pkg/kgo"
7 | "github.com/twmb/franz-go/pkg/kmsg"
8 | )
9 |
10 | type LogDirResponseShard struct {
11 | Err error
12 | Broker kgo.BrokerMetadata
13 | LogDirs *kmsg.DescribeLogDirsResponse
14 | }
15 |
16 | func (s *Service) DescribeLogDirs(ctx context.Context) []LogDirResponseShard {
17 | req := kmsg.NewDescribeLogDirsRequest()
18 | req.Topics = nil // Describe all topics
19 | responses := s.client.RequestSharded(ctx, &req)
20 |
21 | res := make([]LogDirResponseShard, len(responses))
22 | for i, responseShard := range responses {
23 | logDirs, ok := responseShard.Resp.(*kmsg.DescribeLogDirsResponse)
24 | if !ok {
25 | logDirs = &kmsg.DescribeLogDirsResponse{}
26 | }
27 |
28 | res[i] = LogDirResponseShard{
29 | Err: responseShard.Err,
30 | Broker: responseShard.Meta,
31 | LogDirs: logDirs,
32 | }
33 | }
34 |
35 | return res
36 | }
37 |
--------------------------------------------------------------------------------
/minion/metadata.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "time"
7 |
8 | "github.com/twmb/franz-go/pkg/kmsg"
9 | )
10 |
11 | func (s *Service) GetMetadataCached(ctx context.Context) (*kmsg.MetadataResponse, error) {
12 | reqId := ctx.Value("requestId").(string)
13 | key := "metadata-" + reqId
14 |
15 | if cachedRes, exists := s.getCachedItem(key); exists {
16 | return cachedRes.(*kmsg.MetadataResponse), nil
17 | }
18 |
19 | res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) {
20 | metadata, err := s.GetMetadata(ctx)
21 | if err != nil {
22 | return nil, err
23 | }
24 |
25 | s.setCachedItem(key, metadata, 120*time.Second)
26 |
27 | return metadata, nil
28 | })
29 | if err != nil {
30 | return nil, err
31 | }
32 |
33 | return res.(*kmsg.MetadataResponse), nil
34 | }
35 |
36 | func (s *Service) GetMetadata(ctx context.Context) (*kmsg.MetadataResponse, error) {
37 | req := kmsg.NewMetadataRequest()
38 | req.Topics = nil
39 |
40 | res, err := req.RequestWith(ctx, s.client)
41 | if err != nil {
42 | return nil, fmt.Errorf("failed to request metadata: %w", err)
43 | }
44 |
45 | return res, nil
46 | }
47 |
--------------------------------------------------------------------------------
/minion/offset_consumer.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "time"
7 |
8 | "github.com/twmb/franz-go/pkg/kbin"
9 | "github.com/twmb/franz-go/pkg/kerr"
10 | "github.com/twmb/franz-go/pkg/kgo"
11 | "github.com/twmb/franz-go/pkg/kmsg"
12 | "go.uber.org/zap"
13 | )
14 |
15 | // startConsumingOffsets consumes the __consumer_offsets topic and forwards the kafka messages to their respective
16 | // methods where they'll be decoded and further processed.
17 | func (s *Service) startConsumingOffsets(ctx context.Context) {
18 | client := s.client
19 |
20 | s.logger.Info("starting to consume messages from offsets topic")
21 | go s.checkIfConsumerLagIsCaughtUp(ctx)
22 |
23 | for {
24 | select {
25 | case <-ctx.Done():
26 | return
27 | default:
28 | fetches := client.PollFetches(ctx)
29 | errors := fetches.Errors()
30 | for _, err := range errors {
31 | // Log all errors and continue afterwards as we might get errors and still have some fetch results
32 | s.logger.Error("failed to fetch records from kafka",
33 | zap.String("topic", err.Topic),
34 | zap.Int32("partition", err.Partition),
35 | zap.Error(err.Err))
36 | }
37 |
38 | iter := fetches.RecordIter()
39 | for !iter.Done() {
40 | record := iter.Next()
41 | s.storage.markRecordConsumed(record)
42 |
43 | err := s.decodeOffsetRecord(record)
44 | if err != nil {
45 | s.logger.Warn("failed to decode offset record", zap.Error(err))
46 | }
47 | }
48 | }
49 | }
50 | }
51 |
52 | // checkIfConsumerLagIsCaughtUp fetches the newest partition offsets for all partitions in the __consumer_offsets
53 | // topic and compares these against the last consumed messages from our offset consumer. If the consumed offsets are
54 | // higher than the partition offsets this means we caught up the initial lag and can mark our storage as ready. A ready
55 | // store will start to expose consumer group offsets.
56 | func (s *Service) checkIfConsumerLagIsCaughtUp(ctx context.Context) {
57 | for {
58 | time.Sleep(12 * time.Second)
59 | s.logger.Debug("checking if lag in consumer offsets metadataReqTopic is caught up")
60 |
61 | // 1. Fetch metadata for the __consumer_offsets topic to get its partitions
62 | metadataReq := kmsg.NewMetadataRequest()
63 | metadataReqTopic := kmsg.NewMetadataRequestTopic()
64 | topicName := "__consumer_offsets"
65 | metadataReqTopic.Topic = &topicName
66 | metadataReq.Topics = []kmsg.MetadataRequestTopic{metadataReqTopic}
67 |
68 | res, err := metadataReq.RequestWith(ctx, s.client)
69 | if err != nil {
70 | s.logger.Warn("failed to check if consumer lag on offsets metadataReqTopic is caught up because metadata request failed",
71 | zap.Error(err))
72 | continue
73 | }
74 |
75 | // 2. Request high watermarks for consumer offset partitions
76 | topicReqs := make([]kmsg.ListOffsetsRequestTopic, len(res.Topics))
77 | for i, topic := range res.Topics {
78 | req := kmsg.NewListOffsetsRequestTopic()
79 | req.Topic = *topic.Topic
80 |
81 | partitionReqs := make([]kmsg.ListOffsetsRequestTopicPartition, len(topic.Partitions))
82 | for j, partition := range topic.Partitions {
83 | partitionReqs[j] = kmsg.NewListOffsetsRequestTopicPartition()
84 | partitionReqs[j].Partition = partition.Partition
85 | partitionReqs[j].Timestamp = -1 // Newest
86 | }
87 | req.Partitions = partitionReqs
88 |
89 | topicReqs[i] = req
90 | }
91 | offsetReq := kmsg.NewListOffsetsRequest()
92 | offsetReq.Topics = topicReqs
93 | highMarksRes, err := offsetReq.RequestWith(ctx, s.client)
94 | if err != nil {
95 | s.logger.Warn("failed to check if consumer lag on offsets metadataReqTopic is caught up because high watermark request failed",
96 | zap.Error(err))
97 | continue
98 | }
99 | if len(highMarksRes.Topics) != 1 {
100 | s.logger.Error("expected exactly one metadataReqTopic response for high water mark request")
101 | continue
102 | }
103 |
104 | // 3. Check if high watermarks have been consumed. To avoid a race condition here we will wait some time before
105 | // comparing, so that the consumer has enough time to catch up to the new high watermarks we just fetched.
106 | time.Sleep(3 * time.Second)
107 | consumedOffsets := s.storage.getConsumedOffsets()
108 | topicRes := highMarksRes.Topics[0]
109 | isReady := true
110 |
111 | type laggingPartition struct {
112 | Name string
113 | Id int32
114 | Lag int64
115 | }
116 | var partitionsLagging []laggingPartition
117 | totalLag := int64(0)
118 | for _, partition := range topicRes.Partitions {
119 | err := kerr.ErrorForCode(partition.ErrorCode)
120 | if err != nil {
121 | s.logger.Warn("failed to check if consumer lag on offsets metadataReqTopic is caught up because high "+
122 | "watermark request failed, with an inner error",
123 | zap.Error(err))
124 | }
125 |
126 | highWaterMark := partition.Offset - 1
127 | consumedOffset := consumedOffsets[partition.Partition]
128 | partitionLag := highWaterMark - consumedOffset
129 | if partitionLag < 0 {
130 | partitionLag = 0
131 | }
132 |
133 | if partitionLag > 0 {
134 | partitionsLagging = append(partitionsLagging, laggingPartition{
135 | Name: topicRes.Topic,
136 | Id: partition.Partition,
137 | Lag: partitionLag,
138 | })
139 | totalLag += partitionLag
140 | s.logger.Debug("consumer_offsets metadataReqTopic lag has not been caught up yet",
141 | zap.Int32("partition_id", partition.Partition),
142 | zap.Int64("high_water_mark", highWaterMark),
143 | zap.Int64("consumed_offset", consumedOffset),
144 | zap.Int64("partition_lag", partitionLag))
145 | isReady = false
146 | continue
147 | }
148 | }
149 | if isReady {
150 | s.logger.Info("successfully consumed all consumer offsets. consumer group lags will be exported from now on")
151 | s.storage.setReadyState(true)
152 | return
153 | } else {
154 | s.logger.Info("catching up the message lag on consumer offsets",
155 | zap.Int("lagging_partitions_count", len(partitionsLagging)),
156 | zap.Any("lagging_partitions", partitionsLagging),
157 | zap.Int64("total_lag", totalLag))
158 | }
159 | }
160 | }
161 |
162 | // decodeOffsetRecord decodes all messages in the consumer offsets topic by routing records to the correct decoding
163 | // method.
164 | func (s *Service) decodeOffsetRecord(record *kgo.Record) error {
165 | if len(record.Key) < 2 {
166 | return fmt.Errorf("offset commit key is supposed to be at least 2 bytes long")
167 | }
168 | messageVer := (&kbin.Reader{Src: record.Key}).Int16()
169 |
170 | switch messageVer {
171 | case 0, 1:
172 | err := s.decodeOffsetCommit(record)
173 | if err != nil {
174 | return err
175 | }
176 | case 2:
177 | err := s.decodeOffsetMetadata(record)
178 | if err != nil {
179 | return err
180 | }
181 | }
182 |
183 | return nil
184 | }
185 |
186 | // decodeOffsetMetadata decodes a group metadata record, which includes the following information:
187 | // - group
188 | // - protocolType (connect/consumer/...)
189 | // - generation
190 | // - protocol
191 | // - currentStateTimestamp
192 | // - groupMembers (member metadata such as: memberId, groupInstanceId, clientId, clientHost, rebalanceTimeout, ...)
193 | func (s *Service) decodeOffsetMetadata(record *kgo.Record) error {
194 | childLogger := s.logger.With(
195 | zap.String("topic", record.Topic),
196 | zap.Int32("partition_id", record.Partition),
197 | zap.Int64("offset", record.Offset))
198 |
199 | metadataKey := kmsg.NewGroupMetadataKey()
200 | err := metadataKey.ReadFrom(record.Key)
201 | if err != nil {
202 | childLogger.Warn("failed to decode offset metadata key", zap.Error(err))
203 | return fmt.Errorf("failed to decode offset metadata key: %w", err)
204 | }
205 |
206 | if record.Value == nil {
207 | return nil
208 | }
209 | metadataValue := kmsg.NewGroupMetadataValue()
210 | err = metadataValue.ReadFrom(record.Value)
211 | if err != nil {
212 | childLogger.Warn("failed to decode offset metadata value", zap.Error(err))
213 | return fmt.Errorf("failed to decode offset metadata value: %w", err)
214 | }
215 |
216 | return nil
217 | }
218 |
219 | // decodeOffsetCommit decodes an offset commit record, which includes the following information:
220 | // - group, topic, partition
221 | // - offset
222 | // - leaderEpoch
223 | // - metadata (user specified string for each offset commit)
224 | // - commitTimestamp
225 | // - expireTimestamp (only version 1 offset commits / deprecated)
226 | func (s *Service) decodeOffsetCommit(record *kgo.Record) error {
227 | childLogger := s.logger.With(
228 | zap.String("topic", record.Topic),
229 | zap.Int32("partition_id", record.Partition),
230 | zap.Int64("offset", record.Offset))
231 | offsetCommitKey := kmsg.NewOffsetCommitKey()
232 | err := offsetCommitKey.ReadFrom(record.Key)
233 | if err != nil {
234 | childLogger.Warn("failed to decode offset commit key", zap.Error(err))
235 | return fmt.Errorf("failed to decode offset commit key: %w", err)
236 | }
237 |
238 | if record.Value == nil {
239 | // Tombstone - The group offset is expired or no longer valid (e.g. because the topic has been deleted)
240 | s.storage.deleteOffsetCommit(offsetCommitKey)
241 | return nil
242 | }
243 |
244 | offsetCommitValue := kmsg.NewOffsetCommitValue()
245 | err = offsetCommitValue.ReadFrom(record.Value)
246 | if err != nil {
247 | childLogger.Warn("failed to decode offset commit value", zap.Error(err))
248 | return fmt.Errorf("failed to decode offset commit value: %w", err)
249 | }
250 | s.storage.addOffsetCommit(offsetCommitKey, offsetCommitValue)
251 |
252 | return nil
253 | }
254 |
255 | func (s *Service) GetNumberOfOffsetRecordsConsumed() float64 {
256 | return s.storage.getNumberOfConsumedRecords()
257 | }
258 |
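
The readiness check above hinges on one line of arithmetic: the ListOffsets response carries the next offset to be produced, so the last existing offset is that value minus one, and anything negative (the consumer raced past the freshly fetched watermark) counts as zero lag. As a standalone sketch:

package minion

// partitionCatchUpLag mirrors the lag math in checkIfConsumerLagIsCaughtUp: the ListOffsets
// response returns the next offset to be produced, so the last existing offset is listedOffset-1.
// Negative values (the consumer is already past the snapshot) are clamped to zero.
func partitionCatchUpLag(listedOffset, consumedOffset int64) int64 {
	lag := (listedOffset - 1) - consumedOffset
	if lag < 0 {
		return 0
	}
	return lag
}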
--------------------------------------------------------------------------------
/minion/service.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "fmt"
7 | "net/http"
8 | "regexp"
9 | "strings"
10 | "sync"
11 | "time"
12 |
13 | "github.com/twmb/franz-go/pkg/kadm"
14 | "github.com/twmb/franz-go/pkg/kgo"
15 | "github.com/twmb/franz-go/pkg/kmsg"
16 | "github.com/twmb/franz-go/pkg/kversion"
17 | "go.uber.org/zap"
18 | "golang.org/x/sync/singleflight"
19 |
20 | "github.com/cloudhut/kminion/v2/kafka"
21 | )
22 |
23 | type Service struct {
24 | Cfg Config
25 | logger *zap.Logger
26 |
27 | // requestGroup is used to deduplicate multiple concurrent requests to kafka
28 | requestGroup *singleflight.Group
29 | cache map[string]interface{}
30 | cacheLock sync.RWMutex
31 |
32 | AllowedGroupIDsExpr []*regexp.Regexp
33 | IgnoredGroupIDsExpr []*regexp.Regexp
34 | AllowedTopicsExpr []*regexp.Regexp
35 | IgnoredTopicsExpr []*regexp.Regexp
36 |
37 | client *kgo.Client
38 | admClient *kadm.Client
39 | storage *Storage
40 | }
41 |
42 | func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricsNamespace string, ctx context.Context) (*Service, error) {
43 | storage, err := newStorage(logger)
44 | if err != nil {
45 | return nil, fmt.Errorf("failed to create storage: %w", err)
46 | }
47 |
48 | // Kafka client
49 | minionHooks := newMinionClientHooks(logger.Named("kafka_hooks"), metricsNamespace)
50 | kgoOpts := []kgo.Opt{
51 | kgo.WithHooks(minionHooks),
52 | }
53 | if cfg.ConsumerGroups.Enabled && cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeOffsetsTopic {
54 | kgoOpts = append(kgoOpts,
55 | kgo.ConsumeResetOffset(kgo.NewOffset().AtStart()),
56 | kgo.ConsumeTopics("__consumer_offsets"))
57 | }
58 |
59 | logger.Info("connecting to Kafka seed brokers, trying to fetch cluster metadata",
60 | zap.String("seed_brokers", strings.Join(kafkaSvc.Brokers(), ",")))
61 |
62 | client, err := kafkaSvc.CreateAndTestClient(ctx, logger, kgoOpts)
63 | if err != nil {
64 | return nil, fmt.Errorf("failed to create kafka client: %w", err)
65 | }
66 | logger.Info("successfully connected to kafka cluster")
67 |
68 | // Compile regexes. We can ignore the errors because the expressions have already been validated to compile
69 | allowedGroupIDsExpr, _ := compileRegexes(cfg.ConsumerGroups.AllowedGroupIDs)
70 | ignoredGroupIDsExpr, _ := compileRegexes(cfg.ConsumerGroups.IgnoredGroupIDs)
71 | allowedTopicsExpr, _ := compileRegexes(cfg.Topics.AllowedTopics)
72 | ignoredTopicsExpr, _ := compileRegexes(cfg.Topics.IgnoredTopics)
73 |
74 | service := &Service{
75 | Cfg: cfg,
76 | logger: logger.Named("minion_service"),
77 |
78 | requestGroup: &singleflight.Group{},
79 | cache: make(map[string]interface{}),
80 | cacheLock: sync.RWMutex{},
81 |
82 | AllowedGroupIDsExpr: allowedGroupIDsExpr,
83 | IgnoredGroupIDsExpr: ignoredGroupIDsExpr,
84 | AllowedTopicsExpr: allowedTopicsExpr,
85 | IgnoredTopicsExpr: ignoredTopicsExpr,
86 |
87 | client: client,
88 | admClient: kadm.NewClient(client),
89 |
90 | storage: storage,
91 | }
92 |
93 | return service, nil
94 | }
95 |
96 | func (s *Service) Start(ctx context.Context) error {
97 | err := s.ensureCompatibility(ctx)
98 | if err != nil {
99 | return fmt.Errorf("failed to check feature compatibility against Kafka: %w", err)
100 | }
101 |
102 | if s.Cfg.ConsumerGroups.Enabled && s.Cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeOffsetsTopic {
103 | go s.startConsumingOffsets(ctx)
104 | }
105 |
106 | return nil
107 | }
108 |
109 | func (s *Service) isReady() bool {
110 | if s.Cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeAdminAPI {
111 | return true
112 | }
113 |
114 | return s.storage.isReady()
115 | }
116 |
117 | func (s *Service) HandleIsReady() http.HandlerFunc {
118 | type response struct {
119 | StatusCode int `json:"statusCode"`
120 | }
121 | return func(w http.ResponseWriter, r *http.Request) {
122 | status := http.StatusOK
123 | if !s.isReady() {
124 | status = http.StatusServiceUnavailable
125 | }
126 | res := response{StatusCode: status}
127 | resJson, _ := json.Marshal(res)
128 | w.WriteHeader(status)
129 | w.Write(resJson)
130 | }
131 | }
132 |
133 | // ensureCompatibility checks whether the configured options are supported by the connected cluster. For example,
134 | // we check whether the target Kafka's API versions support the DescribeLogDirs request. If they don't, we will
135 | // disable the option and print a warning message.
136 | func (s *Service) ensureCompatibility(ctx context.Context) error {
137 | ctx, cancel := context.WithTimeout(ctx, 15*time.Second)
138 | defer cancel()
139 | versionsRes, err := s.GetAPIVersions(ctx)
140 | if err != nil {
141 | return fmt.Errorf("kafka api versions couldn't be fetched: %w", err)
142 | }
143 | versions := kversion.FromApiVersionsResponse(versionsRes)
144 |
145 | // Check Describe Log Dirs
146 | if s.Cfg.LogDirs.Enabled {
147 | k := kmsg.NewDescribeLogDirsRequest()
148 | isSupported := versions.HasKey(k.Key())
149 | if !isSupported {
150 | s.logger.Warn("describing log dirs is enabled, but it is not supported because your Kafka cluster " +
151 | "version is too old. feature will be disabled")
152 | s.Cfg.LogDirs.Enabled = false
153 | }
154 | }
155 |
156 | return nil
157 | }
158 |
159 | func (s *Service) getCachedItem(key string) (interface{}, bool) {
160 | s.cacheLock.RLock()
161 | defer s.cacheLock.RUnlock()
162 |
163 | val, exists := s.cache[key]
164 | return val, exists
165 | }
166 |
167 | func (s *Service) setCachedItem(key string, val interface{}, timeout time.Duration) {
168 | s.cacheLock.Lock()
169 | defer s.cacheLock.Unlock()
170 |
171 | go func() {
172 | time.Sleep(timeout)
173 | s.deleteCachedItem(key)
174 | }()
175 |
176 | s.cache[key] = val
177 | }
178 |
179 | func (s *Service) deleteCachedItem(key string) {
180 | s.cacheLock.Lock()
181 | defer s.cacheLock.Unlock()
182 |
183 | delete(s.cache, key)
184 | }
185 |
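
The cached getters above expect a per-scrape request ID stored in the context under the literal key "requestId". The exporter wiring that provides it is outside this section; the following is a hypothetical sketch of how a caller could attach one.

package main

import (
	"context"
	"fmt"
	"time"
)

// newScrapeContext attaches a per-scrape request ID under the literal key "requestId",
// matching the ctx.Value("requestId").(string) lookups above. The key and the ID format
// here are illustrative only.
func newScrapeContext(parent context.Context) context.Context {
	reqID := fmt.Sprintf("scrape-%d", time.Now().UnixNano())
	return context.WithValue(parent, "requestId", reqID)
}

func main() {
	ctx := newScrapeContext(context.Background())
	fmt.Println(ctx.Value("requestId"))
}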
--------------------------------------------------------------------------------
/minion/storage.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "fmt"
5 | "strconv"
6 | "time"
7 |
8 | cmap "github.com/orcaman/concurrent-map"
9 | "github.com/twmb/franz-go/pkg/kgo"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "go.uber.org/atomic"
12 | "go.uber.org/zap"
13 | )
14 |
15 | // Storage stores the current state of all consumer group information that has been consumed using the offset consumer.
16 | type Storage struct {
17 | logger *zap.Logger
18 |
19 | // offsetCommits is a map of all consumer offsets.
20 | // A unique key in the format "group:topic:partition" is used as map key.
21 | // Value is of type OffsetCommit
22 | offsetCommits cmap.ConcurrentMap
23 |
24 | // progressTracker is a map that tracks what offsets in each partition have already been consumed
25 | progressTracker cmap.ConcurrentMap
26 |
27 | isReadyBool *atomic.Bool
28 |
29 | // Number of consumed records (used for a Prometheus metric)
30 | consumedRecords *atomic.Float64
31 | }
32 |
33 | // OffsetCommit is used as value for the OffsetCommit map
34 | type OffsetCommit struct {
35 | Key kmsg.OffsetCommitKey
36 | Value kmsg.OffsetCommitValue
37 |
38 | // CommitCount is the number of offset commits for this group-topic-partition combination
39 | CommitCount int
40 |
41 | // ExpireTimestamp is a timestamp that indicates when this offset commit will expire on the Kafka cluster
42 | ExpireTimestamp time.Time
43 | }
44 |
45 | func newStorage(logger *zap.Logger) (*Storage, error) {
46 | return &Storage{
47 | logger: logger.Named("storage"),
48 | offsetCommits: cmap.New(),
49 | progressTracker: cmap.New(),
50 | isReadyBool: atomic.NewBool(false),
51 | consumedRecords: atomic.NewFloat64(0),
52 | }, nil
53 | }
54 |
55 | func (s *Storage) isReady() bool {
56 | return s.isReadyBool.Load()
57 | }
58 |
59 | func (s *Storage) setReadyState(isReady bool) {
60 | s.isReadyBool.Store(isReady)
61 | }
62 |
63 | // markRecordConsumed stores the latest consumed offset for each partition. This is necessary in order to figure out
64 | // whether we have caught up the message lag when starting KMinion as we start consuming from the very oldest offset
65 | // commit.
66 | func (s *Storage) markRecordConsumed(rec *kgo.Record) {
67 | key := fmt.Sprintf("%v", rec.Partition)
68 | s.progressTracker.Set(key, rec.Offset)
69 | s.consumedRecords.Add(1)
70 | }
71 |
72 | func (s *Storage) addOffsetCommit(key kmsg.OffsetCommitKey, value kmsg.OffsetCommitValue) {
73 | // For performance reasons we'll store offset commits using a "unique key". Writes happen way more frequently than
74 | // reads (Prometheus scraping the endpoint). Hence we can group everything by group or topic on the read path as
75 | // needed instead of writing it into nested maps like a map[GroupID]map[Topic]map[Partition]
76 | uniqueKey := encodeOffsetCommitKey(key)
77 |
78 | commitCount := 0
79 | commitInterface, exists := s.offsetCommits.Get(uniqueKey)
80 | if exists {
81 | offsetCommit := commitInterface.(OffsetCommit)
82 | commitCount = offsetCommit.CommitCount
83 | }
84 |
85 | timeDay := 24 * time.Hour
86 | commit := OffsetCommit{
87 | Key: key,
88 | Value: value,
89 | CommitCount: commitCount + 1,
90 | ExpireTimestamp: time.Unix(0, value.CommitTimestamp*int64(time.Millisecond)).Add(7 * timeDay),
91 | }
92 | s.offsetCommits.Set(uniqueKey, commit)
93 | }
94 |
95 | func (s *Storage) getConsumedOffsets() map[int32]int64 {
96 | offsetsByPartition := make(map[int32]int64)
97 | offsets := s.progressTracker.Items()
98 | for partitionIDStr, offset := range offsets {
99 | val := offset.(int64)
100 | partitionID, _ := strconv.ParseInt(partitionIDStr, 10, 32)
101 | offsetsByPartition[int32(partitionID)] = val
102 | }
103 |
104 | return offsetsByPartition
105 | }
106 |
107 | func (s *Storage) getNumberOfConsumedRecords() float64 {
108 | return s.consumedRecords.Load()
109 | }
110 |
111 | func (s *Storage) getGroupOffsets() map[string]map[string]map[int32]OffsetCommit {
112 | // Offsets by group, topic, partition
113 | offsetsByGroup := make(map[string]map[string]map[int32]OffsetCommit)
114 |
115 | if !s.isReady() {
116 | s.logger.Info("Tried to fetch consumer group offsets, but haven't consumed the whole topic yet")
117 | return offsetsByGroup
118 | }
119 |
120 | offsets := s.offsetCommits.Items()
121 | for _, offset := range offsets {
122 | val := offset.(OffsetCommit)
123 |
124 | // Initialize inner maps as necessary
125 | if _, exists := offsetsByGroup[val.Key.Group]; !exists {
126 | offsetsByGroup[val.Key.Group] = make(map[string]map[int32]OffsetCommit)
127 | }
128 | if _, exists := offsetsByGroup[val.Key.Group][val.Key.Topic]; !exists {
129 | offsetsByGroup[val.Key.Group][val.Key.Topic] = make(map[int32]OffsetCommit)
130 | }
131 |
132 | offsetsByGroup[val.Key.Group][val.Key.Topic][val.Key.Partition] = val
133 | }
134 |
135 | return offsetsByGroup
136 | }
137 |
138 | func (s *Storage) deleteOffsetCommit(key kmsg.OffsetCommitKey) {
139 | uniqueKey := encodeOffsetCommitKey(key)
140 | s.offsetCommits.Remove(uniqueKey)
141 | }
142 |
143 | func encodeOffsetCommitKey(key kmsg.OffsetCommitKey) string {
144 | return fmt.Sprintf("%v:%v:%v", key.Group, key.Topic, key.Partition)
145 | }
146 |
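
ListAllConsumerGroupOffsetsInternal (shown earlier) hands the nested group -> topic -> partition map from getGroupOffsets to the collectors. A small sketch of walking that structure, assuming it sits in the minion package:

package minion

// sumGroupOffsets walks the nested map returned by getGroupOffsets /
// ListAllConsumerGroupOffsetsInternal and sums the committed offsets per group.
func sumGroupOffsets(offsets map[string]map[string]map[int32]OffsetCommit) map[string]int64 {
	sums := make(map[string]int64, len(offsets))
	for group, topics := range offsets {
		for _, partitions := range topics {
			for _, commit := range partitions {
				sums[group] += commit.Value.Offset
			}
		}
	}
	return sums
}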
--------------------------------------------------------------------------------
/minion/utils.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 | "strings"
7 | )
8 |
9 | func (s *Service) IsGroupAllowed(groupName string) bool {
10 | isAllowed := false
11 | for _, regex := range s.AllowedGroupIDsExpr {
12 | if regex.MatchString(groupName) {
13 | isAllowed = true
14 | break
15 | }
16 | }
17 |
18 | for _, regex := range s.IgnoredGroupIDsExpr {
19 | if regex.MatchString(groupName) {
20 | isAllowed = false
21 | break
22 | }
23 | }
24 | return isAllowed
25 | }
26 |
27 | func (s *Service) IsTopicAllowed(topicName string) bool {
28 | isAllowed := false
29 | for _, regex := range s.AllowedTopicsExpr {
30 | if regex.MatchString(topicName) {
31 | isAllowed = true
32 | break
33 | }
34 | }
35 |
36 | for _, regex := range s.IgnoredTopicsExpr {
37 | if regex.MatchString(topicName) {
38 | isAllowed = false
39 | break
40 | }
41 | }
42 | return isAllowed
43 | }
44 |
45 | func compileRegex(expr string) (*regexp.Regexp, error) {
46 | if strings.HasPrefix(expr, "/") && strings.HasSuffix(expr, "/") {
47 | substr := expr[1 : len(expr)-1]
48 | regex, err := regexp.Compile(substr)
49 | if err != nil {
50 | return nil, err
51 | }
52 |
53 | return regex, nil
54 | }
55 |
56 | // If the input is not a regex (regexes are marked by surrounding slashes), anchor it so that it must match exactly
57 | regex, err := regexp.Compile("^" + expr + "$")
58 | if err != nil {
59 | return nil, err
60 | }
61 | return regex, nil
62 | }
63 |
64 | func compileRegexes(expr []string) ([]*regexp.Regexp, error) {
65 | compiledExpressions := make([]*regexp.Regexp, len(expr))
66 | for i, exprStr := range expr {
67 | expr, err := compileRegex(exprStr)
68 | if err != nil {
69 | return nil, fmt.Errorf("failed to compile expression string '%v': %w", exprStr, err)
70 | }
71 | compiledExpressions[i] = expr
72 | }
73 |
74 | return compiledExpressions, nil
75 | }
76 |
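
compileRegex gives the allow/ignore lists a small convention: values wrapped in slashes are regular expressions, everything else is an exact, anchored match. A standalone illustration of that convention (the topic names are made up):

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// compile follows the same convention as minion.compileRegex: strings wrapped in slashes
// are treated as regular expressions, everything else as an exact (anchored) match.
func compile(expr string) (*regexp.Regexp, error) {
	if strings.HasPrefix(expr, "/") && strings.HasSuffix(expr, "/") {
		return regexp.Compile(expr[1 : len(expr)-1])
	}
	return regexp.Compile("^" + expr + "$")
}

func main() {
	exact, _ := compile("orders")         // matches only "orders"
	pattern, _ := compile("/^orders-.*/") // matches "orders-v1", "orders-dlq", ...

	fmt.Println(exact.MatchString("orders"), exact.MatchString("orders-v1"))   // true false
	fmt.Println(pattern.MatchString("orders-v1"), pattern.MatchString("team")) // true false
}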
--------------------------------------------------------------------------------
/minion/versions.go:
--------------------------------------------------------------------------------
1 | package minion
2 |
3 | import (
4 | "context"
5 | "fmt"
6 |
7 | "github.com/twmb/franz-go/pkg/kerr"
8 | "github.com/twmb/franz-go/pkg/kmsg"
9 | "github.com/twmb/franz-go/pkg/kversion"
10 | )
11 |
12 | func (s *Service) GetClusterVersion(ctx context.Context) (string, error) {
13 | res, err := s.GetAPIVersions(ctx)
14 | if err != nil {
15 | return "", err
16 | }
17 |
18 | versions := kversion.FromApiVersionsResponse(res)
19 | return versions.VersionGuess(), nil
20 | }
21 |
22 | func (s *Service) GetAPIVersions(ctx context.Context) (*kmsg.ApiVersionsResponse, error) {
23 | versionsReq := kmsg.NewApiVersionsRequest()
24 | versionsReq.ClientSoftwareName = "kminion"
25 | versionsReq.ClientSoftwareVersion = "v2"
26 | res, err := versionsReq.RequestWith(ctx, s.client)
27 | if err != nil {
28 | return nil, fmt.Errorf("failed to request api versions: %w", err)
29 | }
30 |
31 | err = kerr.ErrorForCode(res.ErrorCode)
32 | if err != nil {
33 | return nil, fmt.Errorf("failed to request api versions. Inner kafka error: %w", err)
34 | }
35 |
36 | return res, nil
37 | }
38 |
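
The same ApiVersionsResponse doubles as a feature probe: ensureCompatibility in service.go uses kversion.HasKey to disable log dir scraping on clusters that are too old. A condensed sketch of that check, assuming it lives in this package:

package minion

import (
	"github.com/twmb/franz-go/pkg/kmsg"
	"github.com/twmb/franz-go/pkg/kversion"
)

// supportsDescribeLogDirs reports whether the broker advertises the DescribeLogDirs API,
// mirroring the HasKey check used by ensureCompatibility in service.go.
func supportsDescribeLogDirs(res *kmsg.ApiVersionsResponse) bool {
	versions := kversion.FromApiVersionsResponse(res)
	req := kmsg.NewDescribeLogDirsRequest()
	return versions.HasKey(req.Key())
}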
--------------------------------------------------------------------------------
/prometheus/collect_broker_info.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "github.com/prometheus/client_golang/prometheus"
6 | "go.uber.org/zap"
7 | "strconv"
8 | )
9 |
10 | func (e *Exporter) collectBrokerInfo(ctx context.Context, ch chan<- prometheus.Metric) bool {
11 | metadata, err := e.minionSvc.GetMetadataCached(ctx)
12 | if err != nil {
13 | e.logger.Error("failed to get kafka metadata", zap.Error(err))
14 | return false
15 | }
16 |
17 | for _, broker := range metadata.Brokers {
18 | rack := ""
19 | if broker.Rack != nil {
20 | rack = *broker.Rack
21 | }
22 |
23 | isController := metadata.ControllerID == broker.NodeID
24 | ch <- prometheus.MustNewConstMetric(
25 | e.brokerInfo,
26 | prometheus.GaugeValue,
27 | 1,
28 | strconv.Itoa(int(broker.NodeID)),
29 | broker.Host,
30 | strconv.Itoa(int(broker.Port)),
31 | rack,
32 | strconv.FormatBool(isController),
33 | )
34 | }
35 |
36 | return true
37 | }
38 |
--------------------------------------------------------------------------------
/prometheus/collect_cluster_info.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "github.com/prometheus/client_golang/prometheus"
6 | "go.uber.org/zap"
7 | "strconv"
8 | )
9 |
10 | func (e *Exporter) collectClusterInfo(ctx context.Context, ch chan<- prometheus.Metric) bool {
11 | version, err := e.minionSvc.GetClusterVersion(ctx)
12 | if err != nil {
13 | e.logger.Error("failed to get kafka cluster version", zap.Error(err))
14 | return false
15 | }
16 |
17 | metadata, err := e.minionSvc.GetMetadataCached(ctx)
18 | if err != nil {
19 | e.logger.Error("failed to get kafka metadata", zap.Error(err))
20 | return false
21 | }
22 | brokerCount := len(metadata.Brokers)
23 | clusterID := ""
24 | if metadata.ClusterID != nil {
25 | clusterID = *metadata.ClusterID
26 | }
27 |
28 | ch <- prometheus.MustNewConstMetric(
29 | e.clusterInfo,
30 | prometheus.GaugeValue,
31 | 1,
32 | version,
33 | strconv.Itoa(brokerCount),
34 | strconv.Itoa(int(metadata.ControllerID)),
35 | clusterID,
36 | )
37 | return true
38 | }
39 |
--------------------------------------------------------------------------------
/prometheus/collect_consumer_group_lags.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "math"
6 | "strconv"
7 |
8 | "github.com/prometheus/client_golang/prometheus"
9 | "github.com/twmb/franz-go/pkg/kadm"
10 | "github.com/twmb/franz-go/pkg/kerr"
11 | "go.uber.org/zap"
12 |
13 | "github.com/cloudhut/kminion/v2/minion"
14 | )
15 |
16 | type waterMark struct {
17 | TopicName string
18 | PartitionID int32
19 | LowWaterMark int64
20 | HighWaterMark int64
21 | }
22 |
23 | func (e *Exporter) collectConsumerGroupLags(ctx context.Context, ch chan<- prometheus.Metric) bool {
24 | if !e.minionSvc.Cfg.ConsumerGroups.Enabled {
25 | return true
26 | }
27 |
28 | // Low Watermarks (at the moment they are not needed at all, they could be used to calculate the lag on partitions
29 | // that don't have any active offsets)
30 | lowWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -2)
31 | if err != nil {
32 | e.logger.Error("failed to fetch low water marks", zap.Error(err))
33 | return false
34 | }
35 | // High Watermarks
36 | highWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -1)
37 | if err != nil {
38 | e.logger.Error("failed to fetch low water marks", zap.Error(err))
39 | return false
40 | }
41 | waterMarksByTopic := e.waterMarksByTopic(lowWaterMarks, highWaterMarks)
42 |
43 | // We have two different options to get consumer group offsets - either via the AdminAPI or by consuming the
44 | // __consumer_offsets topic.
45 | if e.minionSvc.Cfg.ConsumerGroups.ScrapeMode == minion.ConsumerGroupScrapeModeAdminAPI {
46 | return e.collectConsumerGroupLagsAdminAPI(ctx, ch, waterMarksByTopic)
47 | } else {
48 | return e.collectConsumerGroupLagsOffsetTopic(ctx, ch, waterMarksByTopic)
49 | }
50 | }
51 |
52 | func (e *Exporter) collectConsumerGroupLagsOffsetTopic(_ context.Context, ch chan<- prometheus.Metric, marks map[string]map[int32]waterMark) bool {
53 | offsets := e.minionSvc.ListAllConsumerGroupOffsetsInternal()
54 | for groupName, group := range offsets {
55 | if !e.minionSvc.IsGroupAllowed(groupName) {
56 | continue
57 | }
58 | offsetCommits := 0
59 |
60 | for topicName, topic := range group {
61 | topicLag := float64(0)
62 | topicOffsetSum := float64(0)
63 | for partitionID, partition := range topic {
64 | childLogger := e.logger.With(
65 | zap.String("consumer_group", groupName),
66 | zap.String("topic_name", topicName),
67 | zap.Int32("partition_id", partitionID),
68 | zap.Int64("group_offset", partition.Value.Offset))
69 |
70 | topicMark, exists := marks[topicName]
71 | if !exists {
72 | childLogger.Warn("consumer group has committed offsets on a topic we don't have watermarks for")
73 | break // We can stop trying to find any other offsets for that topic so let's quit this loop
74 | }
75 | partitionMark, exists := topicMark[partitionID]
76 | if !exists {
77 | childLogger.Warn("consumer group has committed offsets on a partition we don't have watermarks for")
78 | continue
79 | }
80 | lag := float64(partitionMark.HighWaterMark - partition.Value.Offset)
81 | // Lag might be negative because we fetch group offsets after we get partition offsets; this is effectively a
82 | // race condition. Negative lags do not make sense, so we clamp the lag to a minimum of 0.
83 | lag = math.Max(0, lag)
84 | topicLag += lag
85 | topicOffsetSum += float64(partition.Value.Offset)
86 |
87 | // Offset commit count for this consumer group
88 | offsetCommits += partition.CommitCount
89 |
90 | if e.minionSvc.Cfg.ConsumerGroups.Granularity == minion.ConsumerGroupGranularityTopic {
91 | continue
92 | }
93 | ch <- prometheus.MustNewConstMetric(
94 | e.consumerGroupTopicPartitionLag,
95 | prometheus.GaugeValue,
96 | lag,
97 | groupName,
98 | topicName,
99 | strconv.Itoa(int(partitionID)),
100 | )
101 | }
102 | ch <- prometheus.MustNewConstMetric(
103 | e.consumerGroupTopicLag,
104 | prometheus.GaugeValue,
105 | topicLag,
106 | groupName,
107 | topicName,
108 | )
109 | ch <- prometheus.MustNewConstMetric(
110 | e.consumerGroupTopicOffsetSum,
111 | prometheus.GaugeValue,
112 | topicOffsetSum,
113 | groupName,
114 | topicName,
115 | )
116 | }
117 |
118 | ch <- prometheus.MustNewConstMetric(
119 | e.offsetCommits,
120 | prometheus.CounterValue,
121 | float64(offsetCommits),
122 | groupName,
123 | )
124 | }
125 | return true
126 | }
127 |
128 | func (e *Exporter) collectConsumerGroupLagsAdminAPI(ctx context.Context, ch chan<- prometheus.Metric, marks map[string]map[int32]waterMark) bool {
129 | isOk := true
130 |
131 | groupOffsets, err := e.minionSvc.ListAllConsumerGroupOffsetsAdminAPI(ctx)
132 | for groupName, offsetRes := range groupOffsets {
133 | if !e.minionSvc.IsGroupAllowed(groupName) {
134 | continue
135 | }
136 |
137 | err = kerr.ErrorForCode(offsetRes.ErrorCode)
138 | if err != nil {
139 | e.logger.Warn("failed to get offsets from consumer group, inner kafka error",
140 | zap.String("consumer_group", groupName),
141 | zap.Error(err))
142 | isOk = false
143 | continue
144 | }
145 | for _, topic := range offsetRes.Topics {
146 | topicLag := float64(0)
147 | topicOffsetSum := float64(0)
148 | for _, partition := range topic.Partitions {
149 | err := kerr.ErrorForCode(partition.ErrorCode)
150 | if err != nil {
151 | e.logger.Warn("failed to get consumer group offsets for a partition, inner kafka error",
152 | zap.String("consumer_group", groupName),
153 | zap.Error(err))
154 | isOk = false
155 | continue
156 | }
157 |
158 | childLogger := e.logger.With(
159 | zap.String("consumer_group", groupName),
160 | zap.String("topic_name", topic.Topic),
161 | zap.Int32("partition_id", partition.Partition),
162 | zap.Int64("group_offset", partition.Offset))
163 | topicMark, exists := marks[topic.Topic]
164 | if !exists {
165 | childLogger.Warn("consumer group has committed offsets on a topic we don't have watermarks for")
166 | isOk = false
167 | break // We can stop trying to find any other offsets for that topic so let's quit this loop
168 | }
169 | partitionMark, exists := topicMark[partition.Partition]
170 | if !exists {
171 | childLogger.Warn("consumer group has committed offsets on a partition we don't have watermarks for")
172 | isOk = false
173 | continue
174 | }
175 | lag := float64(partitionMark.HighWaterMark - partition.Offset)
176 | // Lag might be negative because we fetch group offsets after we get partition offsets; this is effectively a
177 | // race condition. Negative lags do not make sense, so we clamp the lag to a minimum of 0.
178 | lag = math.Max(0, lag)
179 | topicLag += lag
180 | topicOffsetSum += float64(partition.Offset)
181 |
182 | if e.minionSvc.Cfg.ConsumerGroups.Granularity == minion.ConsumerGroupGranularityTopic {
183 | continue
184 | }
185 | ch <- prometheus.MustNewConstMetric(
186 | e.consumerGroupTopicPartitionLag,
187 | prometheus.GaugeValue,
188 | lag,
189 | groupName,
190 | topic.Topic,
191 | strconv.Itoa(int(partition.Partition)),
192 | )
193 | }
194 |
195 | ch <- prometheus.MustNewConstMetric(
196 | e.consumerGroupTopicLag,
197 | prometheus.GaugeValue,
198 | topicLag,
199 | groupName,
200 | topic.Topic,
201 | )
202 | ch <- prometheus.MustNewConstMetric(
203 | e.consumerGroupTopicOffsetSum,
204 | prometheus.GaugeValue,
205 | topicOffsetSum,
206 | groupName,
207 | topic.Topic,
208 | )
209 | }
210 | }
211 | return isOk
212 | }
213 |
214 | func (e *Exporter) waterMarksByTopic(lowMarks kadm.ListedOffsets, highMarks kadm.ListedOffsets) map[string]map[int32]waterMark {
215 | type partitionID = int32
216 | type topicName = string
217 | waterMarks := make(map[topicName]map[partitionID]waterMark)
218 |
219 | for topic, lowMarksByPartitionID := range lowMarks {
220 | _, exists := waterMarks[topic]
221 | if !exists {
222 | waterMarks[topic] = make(map[partitionID]waterMark)
223 | }
224 |
225 | for _, lowOffset := range lowMarksByPartitionID {
226 | if lowOffset.Err != nil {
227 | e.logger.Debug("failed to get partition low water mark, inner kafka error",
228 | zap.String("topic_name", lowOffset.Topic),
229 | zap.Int32("partition_id", lowOffset.Partition),
230 | zap.Error(lowOffset.Err))
231 | continue
232 | }
233 |
234 | highOffset, exists := highMarks.Lookup(lowOffset.Topic, lowOffset.Partition)
235 | if !exists {
236 | e.logger.Error("got low water marks for a topic's partition but no high watermarks",
237 | zap.String("topic_name", lowOffset.Topic),
238 | zap.Int32("partition_id", lowOffset.Partition),
239 | zap.Int64("offset", lowOffset.Offset))
240 | delete(waterMarks, lowOffset.Topic)
241 | break // Topic watermarks are invalid -> delete & skip this topic
242 | }
243 | if highOffset.Err != nil {
244 | e.logger.Debug("failed to get partition high water mark, inner kafka error",
245 | zap.String("topic_name", lowOffset.Topic),
246 | zap.Int32("partition_id", lowOffset.Partition),
247 | zap.Error(highOffset.Err))
248 | continue
249 | }
250 |
251 | waterMarks[lowOffset.Topic][lowOffset.Partition] = waterMark{
252 | TopicName: lowOffset.Topic,
253 | PartitionID: lowOffset.Partition,
254 | LowWaterMark: lowOffset.Offset,
255 | HighWaterMark: highOffset.Offset,
256 | }
257 | }
258 | }
259 |
260 | return waterMarks
261 | }
262 |
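
Both collectors above reduce each partition to the same clamped difference between the high water mark and the committed offset; negative values can appear because group offsets are fetched after the watermarks. As a one-line helper:

package prometheus

import "math"

// groupPartitionLag mirrors the clamped lag computation used by both collectors above.
// Committed offsets are fetched after the high watermarks, so a slightly newer commit can
// make the raw difference negative; those cases are reported as zero lag.
func groupPartitionLag(highWaterMark, committedOffset int64) float64 {
	return math.Max(0, float64(highWaterMark-committedOffset))
}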
--------------------------------------------------------------------------------
/prometheus/collect_consumer_groups.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "strconv"
7 |
8 | "github.com/prometheus/client_golang/prometheus"
9 | "github.com/twmb/franz-go/pkg/kerr"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "go.uber.org/zap"
12 | )
13 |
14 | func (e *Exporter) collectConsumerGroups(ctx context.Context, ch chan<- prometheus.Metric) bool {
15 | if !e.minionSvc.Cfg.ConsumerGroups.Enabled {
16 | return true
17 | }
18 | groups, err := e.minionSvc.DescribeConsumerGroups(ctx)
19 | if err != nil {
20 | e.logger.Error("failed to collect consumer groups, because Kafka request failed", zap.Error(err))
21 | return false
22 | }
23 |
24 | // The list of groups may be incomplete because some group coordinators may fail to respond. We log an error
25 | // message in that case (in the kafka request method) and those groups will not be included in this list.
26 | for _, grp := range groups {
27 | coordinator := grp.BrokerMetadata.NodeID
28 | for _, group := range grp.Groups.Groups {
29 | err := kerr.ErrorForCode(group.ErrorCode)
30 | if err != nil {
31 | e.logger.Warn("failed to describe consumer group, internal kafka error",
32 | zap.Error(err),
33 | zap.String("group_id", group.Group),
34 | )
35 | continue
36 | }
37 | if !e.minionSvc.IsGroupAllowed(group.Group) {
38 | continue
39 | }
40 | state := 0
41 | if group.State == "Stable" {
42 | state = 1
43 | }
44 | ch <- prometheus.MustNewConstMetric(
45 | e.consumerGroupInfo,
46 | prometheus.GaugeValue,
47 | float64(state),
48 | group.Group,
49 | group.Protocol,
50 | group.ProtocolType,
51 | group.State,
52 | strconv.FormatInt(int64(coordinator), 10),
53 | )
54 |
55 | // total number of members in consumer groups
56 | ch <- prometheus.MustNewConstMetric(
57 | e.consumerGroupMembers,
58 | prometheus.GaugeValue,
59 | float64(len(group.Members)),
60 | group.Group,
61 | )
62 |
63 | // iterate all members and build two maps:
64 | // - {topic -> number-of-consumers}
65 | // - {topic -> number-of-partitions-assigned}
66 | topicConsumers := make(map[string]int)
67 | topicPartitionsAssigned := make(map[string]int)
68 | membersWithEmptyAssignment := 0
69 | failedAssignmentsDecode := 0
70 | for _, member := range group.Members {
71 | if len(member.MemberAssignment) == 0 {
72 | membersWithEmptyAssignment++
73 | continue
74 | }
75 |
76 | kassignment, err := decodeMemberAssignments(group.ProtocolType, member)
77 | if err != nil {
78 | e.logger.Debug("failed to decode consumer group member assignment, internal kafka error",
79 | zap.Error(err),
80 | zap.String("group_id", group.Group),
81 | zap.String("client_id", member.ClientID),
82 | zap.String("member_id", member.MemberID),
83 | zap.String("client_host", member.ClientHost),
84 | )
85 | failedAssignmentsDecode++
86 | continue
87 | }
88 | if kassignment == nil {
89 | // This is expected in the case of protocolTypes that don't provide valuable information
90 | continue
91 | }
92 |
93 | if len(kassignment.Topics) == 0 {
94 | membersWithEmptyAssignment++
95 | }
96 | for _, topic := range kassignment.Topics {
97 | topicConsumers[topic.Topic]++
98 | topicPartitionsAssigned[topic.Topic] += len(topic.Partitions)
99 | }
100 | }
101 |
102 | if failedAssignmentsDecode > 0 {
103 | e.logger.Error("failed to decode consumer group member assignment, internal kafka error",
104 | zap.Error(err),
105 | zap.String("group_id", group.Group),
106 | zap.Int("assignment_decode_failures", failedAssignmentsDecode),
107 | )
108 | }
109 |
110 | // number of members with no assignment in a stable consumer group
111 | if membersWithEmptyAssignment > 0 {
112 | ch <- prometheus.MustNewConstMetric(
113 | e.consumerGroupMembersEmpty,
114 | prometheus.GaugeValue,
115 | float64(membersWithEmptyAssignment),
116 | group.Group,
117 | )
118 | }
119 | // number of members in consumer groups for each topic
120 | for topicName, consumers := range topicConsumers {
121 | ch <- prometheus.MustNewConstMetric(
122 | e.consumerGroupTopicMembers,
123 | prometheus.GaugeValue,
124 | float64(consumers),
125 | group.Group,
126 | topicName,
127 | )
128 | }
129 | // number of partitions assigned in consumer groups for each topic
130 | for topicName, partitions := range topicPartitionsAssigned {
131 | ch <- prometheus.MustNewConstMetric(
132 | e.consumerGroupAssignedTopicPartitions,
133 | prometheus.GaugeValue,
134 | float64(partitions),
135 | group.Group,
136 | topicName,
137 | )
138 | }
139 | }
140 | }
141 | return true
142 | }
143 |
144 | func decodeMemberAssignments(protocolType string, member kmsg.DescribeGroupsResponseGroupMember) (*kmsg.ConsumerMemberAssignment, error) {
145 | switch protocolType {
146 | case "consumer":
147 | a := kmsg.NewConsumerMemberAssignment()
148 | if err := a.ReadFrom(member.MemberAssignment); err != nil {
149 | return nil, fmt.Errorf("failed to decode member assignment: %w", err)
150 | }
151 | return &a, nil
152 | case "connect":
153 | return nil, nil
154 | default:
155 | return nil, nil
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/prometheus/collect_exporter_metrics.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "github.com/prometheus/client_golang/prometheus"
6 | )
7 |
8 | func (e *Exporter) collectExporterMetrics(_ context.Context, ch chan<- prometheus.Metric) bool {
9 | recordsConsumed := e.minionSvc.GetNumberOfOffsetRecordsConsumed()
10 | ch <- prometheus.MustNewConstMetric(
11 | e.offsetConsumerRecordsConsumed,
12 | prometheus.CounterValue,
13 | recordsConsumed,
14 | )
15 | return true
16 | }
17 |
--------------------------------------------------------------------------------
/prometheus/collect_log_dirs.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "github.com/prometheus/client_golang/prometheus"
6 | "github.com/twmb/franz-go/pkg/kerr"
7 | "github.com/twmb/franz-go/pkg/kgo"
8 | "go.uber.org/zap"
9 | "strconv"
10 | )
11 |
12 | func (e *Exporter) collectLogDirs(ctx context.Context, ch chan<- prometheus.Metric) bool {
13 | if !e.minionSvc.Cfg.LogDirs.Enabled {
14 | return true
15 | }
16 | isOk := true
17 |
18 | sizeByBroker := make(map[kgo.BrokerMetadata]int64)
19 | sizeByTopicName := make(map[string]int64)
20 |
21 | logDirsSharded := e.minionSvc.DescribeLogDirs(ctx)
22 | for _, logDirRes := range logDirsSharded {
23 | childLogger := e.logger.With(zap.String("broker_address", logDirRes.Broker.Host),
24 | zap.String("broker_id", strconv.Itoa(int(logDirRes.Broker.NodeID))))
25 |
26 | if logDirRes.Err != nil {
27 | childLogger.Error("failed to describe a broker's log dirs", zap.Error(logDirRes.Err))
28 | isOk = false
29 | continue
30 | }
31 |
32 | for _, dir := range logDirRes.LogDirs.Dirs {
33 | err := kerr.ErrorForCode(dir.ErrorCode)
34 | if err != nil {
35 | childLogger.Error("failed to describe a broker's log dir",
36 | zap.String("log_dir", dir.Dir),
37 | zap.Error(err))
38 | isOk = false
39 | continue
40 | }
41 | for _, topic := range dir.Topics {
42 | topicSize := int64(0)
43 | for _, partition := range topic.Partitions {
44 | topicSize += partition.Size
45 | }
46 | sizeByTopicName[topic.Topic] += topicSize
47 | sizeByBroker[logDirRes.Broker] += topicSize
48 | }
49 | }
50 | }
51 |
52 | // Report the total log dir size per broker
53 | for broker, size := range sizeByBroker {
54 | rackID := ""
55 | if broker.Rack != nil {
56 | rackID = *broker.Rack
57 | }
58 | ch <- prometheus.MustNewConstMetric(
59 | e.brokerLogDirSize,
60 | prometheus.GaugeValue,
61 | float64(size),
62 | strconv.Itoa(int(broker.NodeID)),
63 | broker.Host,
64 | strconv.Itoa(int(broker.Port)),
65 | rackID,
66 | )
67 | }
68 |
69 | // If one of the log dir responses returned an error we can not reliably report the topic log dirs, as there might
70 | // be additional data on the brokers that failed to respond.
71 | if !isOk {
72 | return false
73 | }
74 |
75 | // Report the total log dir size per topic
76 | for topicName, size := range sizeByTopicName {
77 | ch <- prometheus.MustNewConstMetric(
78 | e.topicLogDirSize,
79 | prometheus.GaugeValue,
80 | float64(size),
81 | topicName,
82 | )
83 | }
84 |
85 | return isOk
86 | }
87 |
--------------------------------------------------------------------------------
/prometheus/collect_topic_info.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "strconv"
6 |
7 | "github.com/prometheus/client_golang/prometheus"
8 | "github.com/twmb/franz-go/pkg/kerr"
9 | "go.uber.org/zap"
10 | )
11 |
12 | func (e *Exporter) collectTopicInfo(ctx context.Context, ch chan<- prometheus.Metric) bool {
13 | if !e.minionSvc.Cfg.Topics.Enabled {
14 | return true
15 | }
16 |
17 | metadata, err := e.minionSvc.GetMetadataCached(ctx)
18 | if err != nil {
19 | e.logger.Error("failed to get metadata", zap.Error(err))
20 | return false
21 | }
22 |
23 | topicConfigs, err := e.minionSvc.GetTopicConfigs(ctx)
24 | if err != nil {
25 | e.logger.Error("failed to get topic configs", zap.Error(err))
26 | return false
27 | }
28 |
29 | isOk := true
30 | // ConfigsByTopic is indexed by topic name and config resource name (inner key)
31 | configsByTopic := make(map[string]map[string]string)
32 | for _, resource := range topicConfigs.Resources {
33 | configsByTopic[resource.ResourceName] = make(map[string]string)
34 | typedErr := kerr.TypedErrorForCode(resource.ErrorCode)
35 | if typedErr != nil {
36 | isOk = false
37 | e.logger.Warn("failed to get topic config of a specific topic",
38 | zap.String("topic_name", resource.ResourceName),
39 | zap.Error(typedErr))
40 | continue
41 | }
42 |
43 | for _, config := range resource.Configs {
44 | confVal := "nil"
45 | if config.Value != nil {
46 | confVal = *config.Value
47 | }
48 | configsByTopic[resource.ResourceName][config.Name] = confVal
49 | }
50 |
51 | }
52 |
53 | for _, topic := range metadata.Topics {
54 | topicName := *topic.Topic
55 | if !e.minionSvc.IsTopicAllowed(topicName) {
56 | continue
57 | }
58 | typedErr := kerr.TypedErrorForCode(topic.ErrorCode)
59 | if typedErr != nil {
60 | isOk = false
61 | e.logger.Warn("failed to get metadata of a specific topic",
62 | zap.String("topic_name", topicName),
63 | zap.Error(typedErr))
64 | continue
65 | }
66 | partitionCount := len(topic.Partitions)
67 | replicationFactor := -1
68 | if partitionCount > 0 {
69 | // Partitions should never be empty here, but we check to be safe so that we don't index into an empty slice and panic
70 | replicationFactor = len(topic.Partitions[0].Replicas)
71 | }
72 |
73 | var labelsValues []string
74 | labelsValues = append(labelsValues, topicName)
75 | labelsValues = append(labelsValues, strconv.Itoa(partitionCount))
76 | labelsValues = append(labelsValues, strconv.Itoa(replicationFactor))
77 | for _, key := range e.minionSvc.Cfg.Topics.InfoMetric.ConfigKeys {
78 | labelsValues = append(labelsValues, getOrDefault(configsByTopic[topicName], key, "N/A"))
79 | }
80 | ch <- prometheus.MustNewConstMetric(
81 | e.topicInfo,
82 | prometheus.GaugeValue,
83 | float64(1),
84 | labelsValues...,
85 | )
86 | }
87 | return isOk
88 | }
89 |
90 | func getOrDefault(m map[string]string, key string, defaultValue string) string {
91 | if value, exists := m[key]; exists {
92 | return value
93 | }
94 | return defaultValue
95 | }
96 |
--------------------------------------------------------------------------------
/prometheus/collect_topic_partition_offsets.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "context"
5 | "strconv"
6 |
7 | "github.com/prometheus/client_golang/prometheus"
8 | "go.uber.org/zap"
9 |
10 | "github.com/cloudhut/kminion/v2/minion"
11 | )
12 |
13 | func (e *Exporter) collectTopicPartitionOffsets(ctx context.Context, ch chan<- prometheus.Metric) bool {
14 | if !e.minionSvc.Cfg.Topics.Enabled {
15 | return true
16 | }
17 |
18 | isOk := true
19 |
20 | // Low Watermarks
21 | lowWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -2)
22 | if err != nil {
23 | e.logger.Error("failed to fetch low water marks", zap.Error(err))
24 | return false
25 | }
26 | // High Watermarks
27 | highWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -1)
28 | if err != nil {
29 | e.logger.Error("failed to fetch low water marks", zap.Error(err))
30 | return false
31 | }
32 |
33 | // Process Low Watermarks
34 |
35 | for topicName, partitions := range lowWaterMarks {
36 | if !e.minionSvc.IsTopicAllowed(topicName) {
37 | continue
38 | }
39 |
40 | waterMarkSum := int64(0)
41 | hasErrors := false
42 | for _, offset := range partitions {
43 | if offset.Err != nil {
44 | hasErrors = true
45 | isOk = false
46 | continue
47 | }
48 | waterMarkSum += offset.Offset
49 | // Let's end here if partition metrics shall not be exposed
50 | if e.minionSvc.Cfg.Topics.Granularity == minion.TopicGranularityTopic {
51 | continue
52 | }
53 | ch <- prometheus.MustNewConstMetric(
54 | e.partitionLowWaterMark,
55 | prometheus.GaugeValue,
56 | float64(offset.Offset),
57 | topicName,
58 | strconv.Itoa(int(offset.Partition)),
59 | )
60 | }
61 | // We only want to report the sum of all partition marks if we receive watermarks from all partitions
62 | if !hasErrors {
63 | ch <- prometheus.MustNewConstMetric(
64 | e.topicLowWaterMarkSum,
65 | prometheus.GaugeValue,
66 | float64(waterMarkSum),
67 | topicName,
68 | )
69 | }
70 | }
71 |
72 | for topicName, partitions := range highWaterMarks {
73 | if !e.minionSvc.IsTopicAllowed(topicName) {
74 | continue
75 | }
76 | waterMarkSum := int64(0)
77 | hasErrors := false
78 | for _, offset := range partitions {
79 | if offset.Err != nil {
80 | hasErrors = true
81 | isOk = false
82 | continue
83 | }
84 | waterMarkSum += offset.Offset
85 | // Let's end here if partition metrics shall not be exposed
86 | if e.minionSvc.Cfg.Topics.Granularity == minion.TopicGranularityTopic {
87 | continue
88 | }
89 | ch <- prometheus.MustNewConstMetric(
90 | e.partitionHighWaterMark,
91 | prometheus.GaugeValue,
92 | float64(offset.Offset),
93 | topicName,
94 | strconv.Itoa(int(offset.Partition)),
95 | )
96 | }
97 | // We only want to report the sum of all partition marks if we receive watermarks from all partitions
98 | if !hasErrors {
99 | ch <- prometheus.MustNewConstMetric(
100 | e.topicHighWaterMarkSum,
101 | prometheus.GaugeValue,
102 | float64(waterMarkSum),
103 | topicName,
104 | )
105 | }
106 | }
107 |
108 | return isOk
109 | }
110 |
--------------------------------------------------------------------------------
/prometheus/config.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | type Config struct {
4 | Host string `koanf:"host"`
5 | Port int `koanf:"port"`
6 | Namespace string `koanf:"namespace"`
7 | }
8 |
9 | func (c *Config) SetDefaults() {
10 | c.Port = 8080
11 | c.Namespace = "kminion"
12 | }
13 |
--------------------------------------------------------------------------------