├── .github └── workflows │ ├── docker-image.yml │ └── goreleaser.yml ├── .gitignore ├── .goreleaser.yml ├── Dockerfile ├── LICENSE ├── README.md ├── charts ├── README.md ├── archives │ ├── .gitkeep │ ├── index.yaml │ ├── kminion-0.1.0.tgz │ ├── kminion-0.1.1.tgz │ ├── kminion-0.1.2.tgz │ ├── kminion-0.1.3.tgz │ ├── kminion-0.11.1.tgz │ ├── kminion-0.11.2.tgz │ ├── kminion-0.11.3.tgz │ ├── kminion-0.12.0.tgz │ ├── kminion-0.2.0.tgz │ ├── kminion-0.2.1.tgz │ ├── kminion-0.2.2.tgz │ ├── kminion-0.3.0.tgz │ ├── kminion-0.3.1.tgz │ ├── kminion-0.4.0.tgz │ ├── kminion-0.5.0.tgz │ ├── kminion-0.6.0.tgz │ ├── kminion-0.7.0.tgz │ ├── kminion-0.8.0.tgz │ ├── kminion-0.8.1.tgz │ ├── kminion-0.8.2.tgz │ ├── kminion-0.8.3.tgz │ └── kminion-0.9.0.tgz └── kminion │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── configmap.yaml │ ├── daemonset.yaml │ ├── deployment.yaml │ ├── hpa.yaml │ ├── ingress.yaml │ ├── poddisruptionbudget.yaml │ ├── service.yaml │ ├── serviceaccount.yaml │ └── servicemonitor.yaml │ └── values.yaml ├── config.go ├── docker-compose.yml ├── docs ├── end-to-end.md ├── metrics.md ├── reference-config.yaml └── screenshots │ ├── kminion-cluster.png │ ├── kminion-groups.png │ └── kminion-topics.png ├── e2e ├── client_hooks.go ├── config.go ├── config_consumer.go ├── config_producer.go ├── config_topic.go ├── consumer.go ├── endtoend_message.go ├── group_tracker.go ├── message_tracker.go ├── producer.go ├── service.go ├── topic.go ├── topic_test.go └── utils.go ├── go.mod ├── go.sum ├── kafka ├── client_config_helper.go ├── client_logger.go ├── config.go ├── config_sasl.go ├── config_sasl_gssapi.go ├── config_sasl_oauthbearer.go ├── config_tls.go └── service.go ├── logging ├── config.go └── logger.go ├── main.go ├── minion ├── client_hooks.go ├── config.go ├── config_consumer_group.go ├── config_log_dirs.go ├── config_topic_config.go ├── consumer_group_offsets.go ├── describe_consumer_groups.go ├── describe_topic_config.go ├── list_offsets.go ├── log_dirs.go ├── metadata.go ├── offset_consumer.go ├── service.go ├── storage.go ├── utils.go └── versions.go └── prometheus ├── collect_broker_info.go ├── collect_cluster_info.go ├── collect_consumer_group_lags.go ├── collect_consumer_groups.go ├── collect_exporter_metrics.go ├── collect_log_dirs.go ├── collect_topic_info.go ├── collect_topic_partition_offsets.go ├── config.go └── exporter.go /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: docker-image 3 | on: 4 | push: 5 | tags: ['*'] 6 | branches: ['master'] 7 | paths-ignore: ['charts/**'] 8 | permissions: 9 | id-token: write 10 | contents: read 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: aws-actions/configure-aws-credentials@v4 16 | with: 17 | aws-region: ${{ vars.RP_AWS_CRED_REGION }} 18 | role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }} 19 | - uses: aws-actions/aws-secretsmanager-get-secrets@v2 20 | with: 21 | secret-ids: | 22 | ,sdlc/prod/github/dockerhub 23 | parse-json-secrets: true 24 | - uses: actions/checkout@v4 25 | - uses: docker/setup-qemu-action@v3 26 | - uses: docker/setup-buildx-action@v3 27 | with: 28 | driver-opts: | 29 | image=moby/buildkit:v0.21.1 30 | network=host 31 | - name: Set build date 32 | run: | 33 | echo "BUILT_AT=$(date --rfc-3339=date)" >> ${GITHUB_ENV} 34 | - uses: docker/metadata-action@v5 35 | id: 
docker_meta 36 | with: 37 | # list of Docker images to use as base name for tags 38 | images: | 39 | redpandadata/kminion 40 | name=public.ecr.aws/l9j0i2e0/kminion,enable=${{ startsWith(github.ref, 'refs/tags/v') }} 41 | # generate Docker tags based on the following events/attributes 42 | # Semver type is only active on 'push tag' events, 43 | # hence no enable condition required 44 | tags: | 45 | type=sha,prefix={{branch}}-,format=short,enable={{is_default_branch}} 46 | type=semver,pattern={{raw}} 47 | - uses: docker/login-action@v3 48 | with: 49 | username: ${{ env.DOCKERHUB_USER }} 50 | password: ${{ env.DOCKERHUB_TOKEN }} 51 | - uses: aws-actions/configure-aws-credentials@v4 52 | if: ${{ startsWith(github.ref, 'refs/tags/v') }} 53 | with: 54 | aws-region: us-east-1 55 | role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }} 56 | - uses: aws-actions/amazon-ecr-login@v2 57 | if: ${{ startsWith(github.ref, 'refs/tags/v') }} 58 | with: 59 | registry-type: public 60 | - uses: docker/build-push-action@v6 61 | with: 62 | provenance: false 63 | push: true 64 | platforms: linux/amd64,linux/arm64 65 | tags: ${{ steps.docker_meta.outputs.tags }} 66 | build-args: | 67 | VERSION=${{ fromJSON(steps.docker_meta.outputs.json).labels['org.opencontainers.image.version'] }} 68 | BUILT_AT=${{ env.BUILT_AT }} 69 | COMMIT=${{ github.sha }} 70 | cache-from: type=gha 71 | cache-to: type=gha,mode=max 72 | -------------------------------------------------------------------------------- /.github/workflows/goreleaser.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: goreleaser 3 | on: 4 | push: 5 | tags: ['*'] 6 | jobs: 7 | goreleaser: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | contents: write 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | fetch-depth: 0 15 | - uses: actions/setup-go@v5 16 | with: 17 | go-version-file: 'go.mod' 18 | - uses: goreleaser/goreleaser-action@v6 19 | if: startsWith(github.ref, 'refs/tags/') 20 | with: 21 | version: latest 22 | args: release --clean 23 | workdir: . 
24 | env: 25 | CGO_ENABLED: 0 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | zk-single-kafka-single 14 | zk-multiple-kafka-multiple 15 | .vscode 16 | .idea 17 | 18 | config 19 | /kminion 20 | -------------------------------------------------------------------------------- /.goreleaser.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | release: 4 | name_template: '{{.Version}} / {{time "2006-01-02"}}' 5 | prerelease: auto 6 | mode: append 7 | footer: | 8 | ## Docker Image 9 | Use the following command to pull this release's Docker image: 10 | ```sh 11 | docker pull redpandadata/kminion:{{ .Tag }} 12 | ``` 13 | changelog: 14 | disable: false 15 | use: github 16 | filters: 17 | # Commit messages matching the regexp listed here will be removed from the changelog 18 | exclude: 19 | - '^docs:' 20 | - '^test:' 21 | - '^npm:' 22 | - '^go.mod:' 23 | - '^.github:' 24 | - 'Merge branch' 25 | builds: 26 | - id: kminion 27 | binary: kminion 28 | goos: 29 | - darwin 30 | - linux 31 | - windows 32 | goarch: 33 | - amd64 34 | - arm64 35 | ldflags: 36 | - -s -w -X main.version={{.Version}} -X main.builtAt={{.Date}} -X main.commit={{.Commit}} 37 | checksum: 38 | name_template: 'checksums.txt' 39 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Build image 3 | ############################################################ 4 | FROM golang:1.24-alpine AS builder 5 | 6 | ARG VERSION 7 | ARG BUILT_AT 8 | ARG COMMIT 9 | 10 | RUN apk update && apk add --no-cache git ca-certificates && update-ca-certificates 11 | 12 | WORKDIR /app 13 | 14 | COPY go.mod . 15 | COPY go.sum . 16 | RUN go mod download 17 | 18 | COPY . . 
19 | 20 | RUN CGO_ENABLED=0 go build \ 21 | -ldflags="-w -s \ 22 | -X main.version=$VERSION \ 23 | -X main.commit=$COMMIT \ 24 | -X main.builtAt=$BUILT_AT" \ 25 | -o ./bin/kminion 26 | 27 | ############################################################ 28 | # Runtime Image 29 | ############################################################ 30 | FROM alpine:3 31 | COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ 32 | COPY --from=builder /app/bin/kminion /app/kminion 33 | RUN addgroup -S redpanda \ 34 | && adduser -S redpanda -G redpanda \ 35 | && chmod o+rx /app/kminion \ 36 | && apk upgrade --no-cache 37 | USER redpanda 38 | 39 | ENTRYPOINT ["/app/kminion"] 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 CloudHut 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Exporter for Apache Kafka - KMinion 2 | 3 | KMinion (previously known as Kafka Minion) is a feature-rich and flexible Prometheus Exporter to monitor your Apache 4 | Kafka cluster. All valuable information that is accessible via the Kafka protocol should also be accessible through 5 | KMinion. 6 | 7 | ## 🚀 Features 8 | 9 | - **Kafka versions:** Supports all Kafka versions v0.11+ 10 | - **Supported SASL mechanisms:** plain, scram-sha-256/512, gssapi/kerberos 11 | - **TLS support:** TLS is supported, regardless of whether you need mTLS, a custom CA, encrypted keys, or just the trusted 12 | root certs 13 | - **Consumer Group Lags:** Number of messages a consumer group is lagging behind the latest offset 14 | - **Log dir sizes:** Metric for log dir sizes either grouped by broker or by topic 15 | - **Broker info:** Metric for each broker with its address, broker id, controller and rack id 16 | - **Configurable granularity:** Export metrics (e.g. consumer group lags) either per partition or per topic. Helps to reduce the number of exported metric series. 17 | - **End to End Monitoring:** Sends messages to its own topic and consumes them, measuring a message's real-world "roundtrip" latency. Also provides ack-latency and offset-commit-latency.
[More Info](/docs/end-to-end.md) 18 | - **Configurable targets:** You can configure what topics or groups you'd like to export using regex expressions 19 | - **Multiple config parsers:** It's possible to configure KMinion using YAML, Environment variables or a mix of both 20 | 21 | You can find a list of all exported metrics here: [/docs/metrics.md](/docs/metrics.md) 22 | 23 | ## Getting started 24 | 25 | ### 🐳 Docker image 26 | 27 | All images will be built on each push to master or for every new release. You can find an overview of all available tags 28 | in our [DockerHub repository](https://hub.docker.com/r/redpandadata/kminion/tags). 29 | 30 | ```shell 31 | docker pull redpandadata/kminion:latest 32 | ``` 33 | 34 | ### ☸ Helm chart 35 | 36 | A Helm chart will be maintained as part of Redpanda's [helm-charts](https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion) repository. 37 | 38 | ### 🔧 Configuration 39 | 40 | All options in KMinion can be configured via YAML or environment variables. Configuring some options via YAML and some 41 | via environment variables is also possible. Environment variables take precedence in this case. You can find the 42 | reference config with additional documentation in [/docs/reference-config.yaml](/docs/reference-config.yaml). 43 | 44 | If you want to use a YAML config file, specify the path to the config file by setting the env variable 45 | `CONFIG_FILEPATH`. 46 | 47 | ### 📊 Grafana Dashboards 48 | 49 | I uploaded three separate Grafana dashboards that can be used as inspiration in order to create your own dashboards. Please take note that these dashboards might not immediately work for you due to different labeling in your Prometheus config. 50 | 51 | Cluster Dashboard: https://grafana.com/grafana/dashboards/14012 52 | 53 | Consumer Group Dashboard: https://grafana.com/grafana/dashboards/14014 54 | 55 | Topic Dashboard: https://grafana.com/grafana/dashboards/14013 56 | 57 |

58 | *(Dashboard screenshots embedded here: docs/screenshots/kminion-cluster.png, docs/screenshots/kminion-groups.png, docs/screenshots/kminion-topics.png)* 59 | 60 | 61 |
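Since all three dashboards depend on how Prometheus labels the scraped series, it often helps to start from a plain scrape job and adjust the labels from there. The snippet below is only a minimal sketch: the job name and the `kminion:8080` target are placeholders you need to adapt to wherever your KMinion instance is reachable (it serves metrics on `/metrics` on port 8080 by default).

```yaml
scrape_configs:
  - job_name: kminion              # placeholder job name
    static_configs:
      - targets: ["kminion:8080"]  # placeholder host:port of your KMinion instance
```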

62 | 63 | ### ⚡ Testing locally 64 | 65 | This repo contains a docker-compose file that you can run on your machine. It will spin up a Kafka & ZooKeeper cluster 66 | and starts KMinion on port 8080 which is exposed to your host machine: 67 | 68 | ```shell 69 | # 1. Clone this repo 70 | # 2. Browse to the repo's root directory and run: 71 | docker-compose up 72 | ``` 73 | 74 | ## Chat with us 75 | 76 | We use Slack to communicate. If you are looking for more interactive discussions or support, you are invited to join 77 | our Slack server: https://redpanda.com/slack 78 | 79 | ## License 80 | 81 | KMinion is distributed under the [MIT License](https://github.com/cloudhut/kminion/blob/master/LICENSE). 82 | -------------------------------------------------------------------------------- /charts/README.md: -------------------------------------------------------------------------------- 1 | # Helm Chart 2 | 3 | ⚠️ This chart has been moved to https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion . Please install this chart instead. The existing archives are still being hosted here, to not break existing deployments. 4 | 5 | --- 6 | 7 | This chart is intentionally very light on input validation. The goal was to offer a flexible Helm chart that allows 8 | users to deploy KMinion the way they want to. Therefore it's very flexible at the cost of less input validation, so that 9 | you might run into runtime errors for a misconfiguration. 10 | 11 | All available input is documented inside of the [values.yaml](./kminion/values.yaml) file. 12 | 13 | ## Installing the Helm chart 14 | 15 | ```shell 16 | helm repo add kminion https://raw.githubusercontent.com/cloudhut/kminion/master/charts/archives 17 | helm repo update 18 | helm install -f values.yaml kminion kminion/kminion 19 | ``` 20 | -------------------------------------------------------------------------------- /charts/archives/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/.gitkeep -------------------------------------------------------------------------------- /charts/archives/index.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | entries: 3 | kminion: 4 | - apiVersion: v2 5 | appVersion: v2.2.5 6 | created: "2023-07-03T16:38:22.568312+01:00" 7 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 8 | Kafka 9 | digest: 32e2ee36d0b0a045061d4e1490780fef905b4c85d7a23659819c5cb128aaa119 10 | name: kminion 11 | type: application 12 | urls: 13 | - kminion-0.12.0.tgz 14 | version: 0.12.0 15 | - apiVersion: v2 16 | appVersion: v2.2.5 17 | created: "2023-07-03T16:38:22.567922+01:00" 18 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 19 | Kafka 20 | digest: 8a7be130d57f6f8ead720277b69319ff4dcd364859e80f4750416abe5ed460c3 21 | name: kminion 22 | type: application 23 | urls: 24 | - kminion-0.11.3.tgz 25 | version: 0.11.3 26 | - apiVersion: v2 27 | appVersion: v2.2.3 28 | created: "2023-07-03T16:38:22.5675+01:00" 29 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 30 | Kafka 31 | digest: 42991a871f58b6d31a9e5b38539eb3d1e9cd35c0097a0fcf63f21f818fa7a999 32 | name: kminion 33 | type: application 34 | urls: 35 | - kminion-0.11.2.tgz 36 | version: 0.11.2 37 | - apiVersion: v2 38 | appVersion: v2.2.3 39 | created: 
"2023-07-03T16:38:22.566877+01:00" 40 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 41 | Kafka 42 | digest: 65d7231f1e8ee586bec42bc383b66726d596fe03e0f3183e14b688174a3a8112 43 | name: kminion 44 | type: application 45 | urls: 46 | - kminion-0.11.1.tgz 47 | version: 0.11.1 48 | - apiVersion: v2 49 | appVersion: v2.2.0 50 | created: "2023-07-03T16:38:22.575384+01:00" 51 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 52 | Kafka 53 | digest: 358bdd509f573049d4bfe77d2edb94c7ad3938f609aea11a8e2c2dc65cca2a9a 54 | name: kminion 55 | type: application 56 | urls: 57 | - kminion-0.9.0.tgz 58 | version: 0.9.0 59 | - apiVersion: v2 60 | appVersion: v2.2.0 61 | created: "2023-07-03T16:38:22.574906+01:00" 62 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 63 | Kafka 64 | digest: be8f0047b345d3954fc7c7e7f8953a848c909ef253107d6e77ed747843ddd167 65 | name: kminion 66 | type: application 67 | urls: 68 | - kminion-0.8.3.tgz 69 | version: 0.8.3 70 | - apiVersion: v2 71 | appVersion: v2.1.0 72 | created: "2023-07-03T16:38:22.573746+01:00" 73 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 74 | Kafka 75 | digest: 888bc665cddc6b6b99af1ce6dd1dea0b107a2e928dff6bfe1c077bc741e20ef7 76 | name: kminion 77 | type: application 78 | urls: 79 | - kminion-0.8.2.tgz 80 | version: 0.8.2 81 | - apiVersion: v2 82 | appVersion: v2.1.0 83 | created: "2023-07-03T16:38:22.573271+01:00" 84 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 85 | Kafka 86 | digest: e59c5d5574f162708bf1434c266acbfd9040a89aa7a4abd4a0db70885248e38d 87 | name: kminion 88 | type: application 89 | urls: 90 | - kminion-0.8.1.tgz 91 | version: 0.8.1 92 | - apiVersion: v2 93 | appVersion: v2.1.0 94 | created: "2023-07-03T16:38:22.572697+01:00" 95 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 96 | Kafka 97 | digest: f54d8236f8cf03c863b53e077e1647164ffe2a7c34e1cf77101fa3312c589706 98 | name: kminion 99 | type: application 100 | urls: 101 | - kminion-0.8.0.tgz 102 | version: 0.8.0 103 | - apiVersion: v2 104 | appVersion: v2.1.0 105 | created: "2023-07-03T16:38:22.572269+01:00" 106 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 107 | Kafka 108 | digest: 4cc64cd9f78bd55673b00612579157e493020fb76440abbef10fe5152aef9acc 109 | name: kminion 110 | type: application 111 | urls: 112 | - kminion-0.7.0.tgz 113 | version: 0.7.0 114 | - apiVersion: v2 115 | appVersion: v2.1.0 116 | created: "2023-07-03T16:38:22.571852+01:00" 117 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 118 | Kafka 119 | digest: 0955e04fe9ef4b516fb0d9ed439ae79778ccdffcf817f09099790cb7e183e4d4 120 | name: kminion 121 | type: application 122 | urls: 123 | - kminion-0.6.0.tgz 124 | version: 0.6.0 125 | - apiVersion: v2 126 | appVersion: v2.0.0 127 | created: "2023-07-03T16:38:22.571391+01:00" 128 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 129 | Kafka 130 | digest: d3eb64d05535e136802538662eef7e9fdfdb3f0b93b6a42dfdcc93ee7deeadbd 131 | name: kminion 132 | type: application 133 | urls: 134 | - kminion-0.5.0.tgz 135 | version: 0.5.0 136 | - apiVersion: v2 137 | appVersion: v2.0.0 138 | created: "2023-07-03T16:38:22.570618+01:00" 139 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 140 | Kafka 141 | digest: 6b4209352d1dffd7873791ee1573dc325eb08d67656b01b430729f45dea4c09a 142 | 
name: kminion 143 | type: application 144 | urls: 145 | - kminion-0.4.0.tgz 146 | version: 0.4.0 147 | - apiVersion: v2 148 | appVersion: v2.0.0 149 | created: "2023-07-03T16:38:22.570281+01:00" 150 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 151 | Kafka 152 | digest: c51e3b45791e9fd51f33036916b0d36f7ac695e2fa916a9e99882ea83914ed97 153 | name: kminion 154 | type: application 155 | urls: 156 | - kminion-0.3.1.tgz 157 | version: 0.3.1 158 | - apiVersion: v2 159 | appVersion: v2.0.0 160 | created: "2023-07-03T16:38:22.569892+01:00" 161 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 162 | Kafka 163 | digest: a2be2dd8a02dc5222ec7386195a0e25b2682a39bbdcf52b60793c171acac7653 164 | name: kminion 165 | type: application 166 | urls: 167 | - kminion-0.3.0.tgz 168 | version: 0.3.0 169 | - apiVersion: v2 170 | appVersion: v2.0.0 171 | created: "2023-07-03T16:38:22.569445+01:00" 172 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 173 | Kafka 174 | digest: 321b6d5ff95ce310d2a3257b3d55f9ced51de99af6519d6d91723d7bdb6456fa 175 | name: kminion 176 | type: application 177 | urls: 178 | - kminion-0.2.2.tgz 179 | version: 0.2.2 180 | - apiVersion: v2 181 | appVersion: v2.0.0 182 | created: "2023-07-03T16:38:22.569089+01:00" 183 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 184 | Kafka 185 | digest: ed57df27158521a1eb33d215731fcc3248c71b3f36a4a029eb2d3a7b617ca519 186 | name: kminion 187 | type: application 188 | urls: 189 | - kminion-0.2.1.tgz 190 | version: 0.2.1 191 | - apiVersion: v2 192 | appVersion: v2.0.0 193 | created: "2023-07-03T16:38:22.568694+01:00" 194 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 195 | Kafka 196 | digest: 025661ee7cc574ad8dde7a68093a3b614fc92e26dd5dd398fc89d0b5308010e1 197 | name: kminion 198 | type: application 199 | urls: 200 | - kminion-0.2.0.tgz 201 | version: 0.2.0 202 | - apiVersion: v2 203 | appVersion: v2.0.0 204 | created: "2023-07-03T16:38:22.566269+01:00" 205 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 206 | Kafka 207 | digest: e277e976d864b4bd2e505038dd865a9300486ae8c4323d3f0be40b84df75732b 208 | name: kminion 209 | type: application 210 | urls: 211 | - kminion-0.1.3.tgz 212 | version: 0.1.3 213 | - apiVersion: v2 214 | appVersion: v2.0.0 215 | created: "2023-07-03T16:38:22.565773+01:00" 216 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 217 | Kafka 218 | digest: 562937d3613624c55984e51adbc6765e7898d1cf8cc2d7d241b6d671bbc12303 219 | name: kminion 220 | type: application 221 | urls: 222 | - kminion-0.1.2.tgz 223 | version: 0.1.2 224 | - apiVersion: v2 225 | appVersion: v2.0.0 226 | created: "2023-07-03T16:38:22.562776+01:00" 227 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 228 | Kafka 229 | digest: 25e83d7c7cc92a63268d76b13ecc13077758b48be093490f281498a4f55ad3ca 230 | name: kminion 231 | type: application 232 | urls: 233 | - kminion-0.1.1.tgz 234 | version: 0.1.1 235 | - apiVersion: v2 236 | appVersion: v2.0.0 237 | created: "2023-07-03T16:38:22.562046+01:00" 238 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache 239 | Kafka 240 | digest: 7c10e9d9957e9752bc6f4b4a1fffb742d88cd57be06bf4f26ff7b5031645ccbd 241 | name: kminion 242 | type: application 243 | urls: 244 | - kminion-0.1.0.tgz 245 | version: 0.1.0 246 | generated: "2023-07-03T16:38:22.560328+01:00" 247 | 
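The index above is what `helm repo add` reads when you point it at the archives URL from charts/README.md. If you still rely on these archived charts, you can pin one of the versions listed in the index explicitly. This is only an illustrative sketch; it assumes the repo alias `kminion` from the chart README and a local `values.yaml` with your overrides:

```shell
helm repo update
# pin a chart version that appears in index.yaml, e.g. 0.12.0
helm install kminion kminion/kminion --version 0.12.0 -f values.yaml
```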
-------------------------------------------------------------------------------- /charts/archives/kminion-0.1.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.1.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.1.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.1.2.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.2.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.1.3.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.1.3.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.11.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.11.1.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.11.2.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.11.2.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.11.3.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.11.3.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.12.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.12.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.2.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.2.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.2.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.2.1.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.2.2.tgz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.2.2.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.3.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.3.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.3.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.3.1.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.4.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.4.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.5.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.5.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.6.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.6.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.7.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.7.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.8.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.0.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.8.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.1.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.8.2.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.2.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.8.3.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.8.3.tgz -------------------------------------------------------------------------------- /charts/archives/kminion-0.9.0.tgz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/charts/archives/kminion-0.9.0.tgz -------------------------------------------------------------------------------- /charts/kminion/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/kminion/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: kminion 3 | description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.12.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "v2.2.5" 25 | -------------------------------------------------------------------------------- /charts/kminion/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if .Values.ingress.enabled }} 3 | {{- range .Values.ingress.hosts }} 4 | http://{{ . }} 5 | {{- end }} 6 | {{- else if contains "NodePort" .Values.service.type }} 7 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kminion.fullname" . }}) 8 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 9 | echo http://$NODE_IP:$NODE_PORT 10 | {{- else if contains "LoadBalancer" .Values.service.type }} 11 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 12 | You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kminion.fullname" . }}' 13 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kminion.fullname" . 
}} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 14 | echo http://$SERVICE_IP:{{ .Values.service.port }} 15 | {{- else if contains "ClusterIP" .Values.service.type }} 16 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kminion.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 17 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") 18 | echo "Visit http://127.0.0.1:8080 to use your application" 19 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT 20 | {{- end }} 21 | -------------------------------------------------------------------------------- /charts/kminion/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "kminion.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "kminion.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "kminion.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "kminion.labels" -}} 37 | helm.sh/chart: {{ include "kminion.chart" . }} 38 | {{ include "kminion.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- if .Values.customLabels}} 44 | {{ toYaml .Values.customLabels }} 45 | {{- end}} 46 | {{- end }} 47 | 48 | {{/* 49 | Selector labels 50 | */}} 51 | {{- define "kminion.selectorLabels" -}} 52 | app.kubernetes.io/name: {{ include "kminion.name" . }} 53 | app.kubernetes.io/instance: {{ .Release.Name }} 54 | {{- end }} 55 | 56 | {{/* 57 | Create the name of the service account to use 58 | */}} 59 | {{- define "kminion.serviceAccountName" -}} 60 | {{- if .Values.serviceAccount.create }} 61 | {{- default (include "kminion.fullname" .) .Values.serviceAccount.name }} 62 | {{- else }} 63 | {{- default "default" .Values.serviceAccount.name }} 64 | {{- end }} 65 | {{- end }} 66 | 67 | {{/* 68 | Return the appropriate apiVersion for ingress. 
69 | */}} 70 | {{- define "kminion.ingress.apiVersion" -}} 71 | {{- if and ($.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19-0" .Capabilities.KubeVersion.Version) }} 72 | {{- print "networking.k8s.io/v1" }} 73 | {{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" }} 74 | {{- print "networking.k8s.io/v1beta1" }} 75 | {{- else }} 76 | {{- print "extensions/v1beta1" }} 77 | {{- end }} 78 | {{- end }} 79 | {{/* 80 | Return if ingress is stable. 81 | */}} 82 | {{- define "kminion.ingress.isStable" -}} 83 | {{- eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1" }} 84 | {{- end }} 85 | {{/* 86 | Return if ingress supports ingressClassName. 87 | */}} 88 | {{- define "kminion.ingress.supportsIngressClassName" -}} 89 | {{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }} 90 | {{- end }} 91 | 92 | {{/* 93 | Return if ingress supports pathType. 94 | */}} 95 | {{- define "kminion.ingress.supportsPathType" -}} 96 | {{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }} 97 | {{- end }} 98 | 99 | {{/* 100 | Return the appropriate apiVersion for podDisruptionBudget. 101 | */}} 102 | {{- define "kminion.podDisruptionBudget.apiVersion" -}} 103 | {{- if $.Capabilities.APIVersions.Has "policy/v1/PodDisruptionBudget" }} 104 | {{- print "policy/v1" }} 105 | {{- else }} 106 | {{- print "policy/v1beta1" }} 107 | {{- end }} 108 | {{- end }} 109 | -------------------------------------------------------------------------------- /charts/kminion/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{include "kminion.fullname" .}} 5 | namespace: {{ .Release.Namespace | quote }} 6 | labels: 7 | {{- include "kminion.labels" . | nindent 4}} 8 | data: 9 | config.yaml: | 10 | {{- toYaml .Values.kminion.config | nindent 4}} 11 | -------------------------------------------------------------------------------- /charts/kminion/templates/daemonset.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.daemonset.enabled }} 2 | apiVersion: apps/v1 3 | kind: DaemonSet 4 | metadata: 5 | name: {{include "kminion.fullname" .}} 6 | namespace: {{ .Release.Namespace | quote }} 7 | labels: 8 | {{- include "kminion.labels" . | nindent 4}} 9 | spec: 10 | updateStrategy: 11 | type: OnDelete 12 | selector: 13 | matchLabels: 14 | {{- include "kminion.selectorLabels" . | nindent 6}} 15 | template: 16 | metadata: 17 | {{- with .Values.podAnnotations}} 18 | annotations: 19 | {{- toYaml . | nindent 8}} 20 | {{- end}} 21 | labels: 22 | {{- include "kminion.selectorLabels" . | nindent 8}} 23 | {{- if .Values.customLabels}} 24 | {{toYaml .Values.customLabels | nindent 8}} 25 | {{- end}} 26 | spec: 27 | {{- with .Values.imagePullSecrets}} 28 | imagePullSecrets: 29 | {{- toYaml . 
| nindent 8}} 30 | {{- end}} 31 | securityContext: 32 | {{- toYaml .Values.podSecurityContext | nindent 8}} 33 | serviceAccountName: {{ .Values.serviceAccount.name }} 34 | volumes: 35 | - name: config 36 | configMap: 37 | name: {{include "kminion.fullname" .}} 38 | {{- range .Values.deployment.volumes.secrets}} 39 | - name: {{.secretName}} 40 | secret: 41 | secretName: {{.secretName}} 42 | {{- end}} 43 | containers: 44 | - name: {{.Chart.Name}} 45 | securityContext: 46 | {{- toYaml .Values.securityContext | nindent 12}} 47 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" 48 | imagePullPolicy: {{.Values.image.pullPolicy}} 49 | ports: 50 | - name: metrics 51 | containerPort: {{.Values.service.port}} 52 | protocol: TCP 53 | env: 54 | - name: POD_NAME 55 | valueFrom: 56 | fieldRef: 57 | fieldPath: metadata.name 58 | - name: POD_NAMESPACE 59 | valueFrom: 60 | fieldRef: 61 | fieldPath: metadata.namespace 62 | - name: CONFIG_FILEPATH 63 | value: /etc/kminion/config.yaml 64 | {{- range .Values.deployment.env.values}} 65 | - name: {{.name}} 66 | value: {{.value | quote}} 67 | {{- end}} 68 | {{- range .Values.deployment.env.secretKeyRefs}} 69 | - name: {{.name}} 70 | valueFrom: 71 | secretKeyRef: 72 | name: {{.secretName}} 73 | key: {{.secretKey}} 74 | {{- end}} 75 | {{- range .Values.deployment.env.configMapKeyRefs}} 76 | - name: {{.name}} 77 | valueFrom: 78 | configMapKeyRef: 79 | name: {{.configMapName}} 80 | key: {{.configMapKey}} 81 | {{- end}} 82 | volumeMounts: 83 | - name: config 84 | mountPath: /etc/kminion 85 | {{- range .Values.deployment.volumes.secrets}} 86 | - name: {{.secretName}} 87 | mountPath: {{.mountPath}} 88 | {{- end}} 89 | resources: 90 | {{- toYaml .Values.resources | nindent 12}} 91 | livenessProbe: 92 | failureThreshold: 3 93 | httpGet: 94 | path: /ready 95 | port: metrics 96 | scheme: HTTP 97 | initialDelaySeconds: 10 98 | periodSeconds: 10 99 | successThreshold: 1 100 | timeoutSeconds: 1 101 | readinessProbe: 102 | failureThreshold: 3 103 | httpGet: 104 | path: /ready 105 | port: metrics 106 | scheme: HTTP 107 | periodSeconds: 10 108 | successThreshold: 1 109 | timeoutSeconds: 1 110 | {{- with .Values.affinity}} 111 | affinity: 112 | {{- toYaml . | nindent 8}} 113 | {{- end}} 114 | {{- with .Values.tolerations}} 115 | tolerations: 116 | {{- toYaml . | nindent 8}} 117 | {{- end}} 118 | {{- end }} 119 | -------------------------------------------------------------------------------- /charts/kminion/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{include "kminion.fullname" .}} 5 | namespace: {{ .Release.Namespace | quote }} 6 | labels: 7 | {{- include "kminion.labels" . | nindent 4}} 8 | {{- with .Values.deployment.labels}} 9 | {{- toYaml . | nindent 4}} 10 | {{- end}} 11 | {{- with .Values.deployment.annotations}} 12 | annotations: 13 | {{- toYaml . | nindent 4}} 14 | {{- end}} 15 | spec: 16 | {{- if not .Values.autoscaling.enabled}} 17 | replicas: {{.Values.replicaCount}} 18 | {{- end}} 19 | selector: 20 | matchLabels: 21 | {{- include "kminion.selectorLabels" . | nindent 6}} 22 | template: 23 | metadata: 24 | {{- with .Values.podAnnotations}} 25 | annotations: 26 | {{- toYaml . | nindent 8}} 27 | {{- end}} 28 | labels: 29 | {{- include "kminion.selectorLabels" . 
| nindent 8}} 30 | {{- if .Values.customLabels}} 31 | {{toYaml .Values.customLabels | nindent 8}} 32 | {{- end}} 33 | spec: 34 | {{- with .Values.imagePullSecrets}} 35 | imagePullSecrets: 36 | {{- toYaml . | nindent 8}} 37 | {{- end}} 38 | serviceAccountName: {{include "kminion.serviceAccountName" .}} 39 | securityContext: 40 | {{- toYaml .Values.podSecurityContext | nindent 8}} 41 | volumes: 42 | - name: config 43 | configMap: 44 | name: {{include "kminion.fullname" .}} 45 | {{- range .Values.deployment.volumes.secrets}} 46 | - name: {{.secretName}} 47 | secret: 48 | secretName: {{.secretName}} 49 | {{- end}} 50 | {{- with .Values.deployment.volumes.extra }} 51 | {{- toYaml . | nindent 8 }} 52 | {{- end }} 53 | initContainers: 54 | {{- with .Values.deployment.initContainers }} 55 | {{- toYaml . | nindent 8 }} 56 | {{- end }} 57 | containers: 58 | - name: {{.Chart.Name}} 59 | securityContext: 60 | {{- toYaml .Values.securityContext | nindent 12}} 61 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" 62 | imagePullPolicy: {{.Values.image.pullPolicy}} 63 | ports: 64 | - name: metrics 65 | containerPort: {{.Values.service.port}} 66 | protocol: TCP 67 | env: 68 | - name: CONFIG_FILEPATH 69 | value: /etc/kminion/config.yaml 70 | {{- range .Values.deployment.env.values}} 71 | - name: {{.name}} 72 | value: {{.value | quote}} 73 | {{- end}} 74 | {{- range .Values.deployment.env.secretKeyRefs}} 75 | - name: {{.name}} 76 | valueFrom: 77 | secretKeyRef: 78 | name: {{.secretName}} 79 | key: {{.secretKey}} 80 | {{- end}} 81 | {{- range .Values.deployment.env.configMapKeyRefs}} 82 | - name: {{.name}} 83 | valueFrom: 84 | configMapKeyRef: 85 | name: {{.configMapName}} 86 | key: {{.configMapKey}} 87 | {{- end}} 88 | volumeMounts: 89 | - name: config 90 | mountPath: /etc/kminion 91 | {{- range .Values.deployment.volumes.secrets}} 92 | - name: {{.secretName}} 93 | mountPath: {{.mountPath}} 94 | {{- end}} 95 | resources: 96 | {{- toYaml .Values.resources | nindent 12}} 97 | {{- if .Values.deployment.readinessProbe.enabled }} 98 | readinessProbe: 99 | httpGet: 100 | path: /ready 101 | port: {{.Values.service.port}} 102 | initialDelaySeconds: 10 103 | {{- end }} 104 | {{- with .Values.deployment.extraContainers }} 105 | {{- toYaml . | nindent 8 }} 106 | {{- end }} 107 | {{- with .Values.nodeSelector}} 108 | nodeSelector: 109 | {{- toYaml . | nindent 8}} 110 | {{- end}} 111 | {{- with .Values.affinity}} 112 | affinity: 113 | {{- toYaml . | nindent 8}} 114 | {{- end}} 115 | {{- with .Values.tolerations}} 116 | tolerations: 117 | {{- toYaml . | nindent 8}} 118 | {{- end}} 119 | -------------------------------------------------------------------------------- /charts/kminion/templates/hpa.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.autoscaling.enabled }} 2 | apiVersion: {{ ternary "autoscaling/v2" "autoscaling/v2beta1" (.Capabilities.APIVersions.Has "autoscaling/v2") }} 3 | kind: HorizontalPodAutoscaler 4 | metadata: 5 | name: {{ include "kminion.fullname" . }} 6 | namespace: {{ .Release.Namespace | quote }} 7 | labels: 8 | {{- include "kminion.labels" . | nindent 4 }} 9 | spec: 10 | scaleTargetRef: 11 | apiVersion: apps/v1 12 | kind: Deployment 13 | name: {{ include "kminion.fullname" . 
}} 14 | minReplicas: {{ .Values.autoscaling.minReplicas }} 15 | maxReplicas: {{ .Values.autoscaling.maxReplicas }} 16 | metrics: 17 | {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} 18 | - type: Resource 19 | resource: 20 | name: cpu 21 | {{- if .Capabilities.APIVersions.Has "autoscaling/v2" }} 22 | target: 23 | type: Utilization 24 | averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} 25 | {{ else }} 26 | targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} 27 | {{- end }} 28 | {{- end }} 29 | {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} 30 | - type: Resource 31 | resource: 32 | name: memory 33 | {{- if .Capabilities.APIVersions.Has "autoscaling/v2" }} 34 | target: 35 | type: Utilization 36 | averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} 37 | {{ else }} 38 | targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} 39 | {{- end }} 40 | {{- end }} 41 | {{- end }} 42 | -------------------------------------------------------------------------------- /charts/kminion/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | {{- $fullName := include "kminion.fullname" . -}} 3 | {{- $svcPort := .Values.service.port -}} 4 | {{- $ingressApiIsStable := eq (include "kminion.ingress.isStable" .) "true" -}} 5 | {{- $ingressSupportsIngressClassName := eq (include "kminion.ingress.supportsIngressClassName" .) "true" -}} 6 | {{- $ingressSupportsPathType := eq (include "kminion.ingress.supportsPathType" .) "true" -}} 7 | {{- $fullName := include "kminion.fullname" . -}} 8 | {{- $servicePort := .Values.service.port -}} 9 | {{- $ingressPath := .Values.ingress.path -}} 10 | {{- $ingressPathType := .Values.ingress.pathType -}} 11 | {{- $extraPaths := .Values.ingress.extraPaths -}} 12 | 13 | apiVersion: {{ include "kminion.ingress.apiVersion" . }} 14 | kind: Ingress 15 | metadata: 16 | name: {{ $fullName }} 17 | namespace: {{ .Release.Namespace | quote }} 18 | labels: 19 | {{- include "kminion.labels" . | nindent 4 }} 20 | {{- with .Values.ingress.annotations }} 21 | annotations: 22 | {{- toYaml . | nindent 4 }} 23 | {{- end }} 24 | spec: 25 | {{- if and $ingressSupportsIngressClassName .Values.ingress.ingressClassName }} 26 | ingressClassName: {{ .Values.ingress.ingressClassName }} 27 | {{- end -}} 28 | {{- with .Values.ingress.tls }} 29 | tls: 30 | {{- tpl (toYaml .) $ | nindent 4 }} 31 | {{- end }} 32 | rules: 33 | {{- if .Values.ingress.hosts }} 34 | {{- range .Values.ingress.hosts }} 35 | - host: {{ tpl . $ }} 36 | http: 37 | paths: 38 | {{- with $extraPaths }} 39 | {{- toYaml . | nindent 10 }} 40 | {{- end }} 41 | - path: {{ $ingressPath }} 42 | {{- if $ingressSupportsPathType }} 43 | pathType: {{ $ingressPathType }} 44 | {{- end }} 45 | backend: 46 | {{- if $ingressApiIsStable }} 47 | service: 48 | name: {{ $fullName }} 49 | port: 50 | number: {{ $servicePort }} 51 | {{- else }} 52 | serviceName: {{ $fullName }} 53 | servicePort: {{ $servicePort }} 54 | {{- end }} 55 | {{- end }} 56 | {{- else }} 57 | - http: 58 | paths: 59 | - backend: 60 | {{- if $ingressApiIsStable }} 61 | service: 62 | name: {{ $fullName }} 63 | port: 64 | number: {{ $servicePort }} 65 | {{- else }} 66 | serviceName: {{ $fullName }} 67 | servicePort: {{ $servicePort }} 68 | {{- end }} 69 | {{- with $ingressPath }} 70 | path: {{ . 
}} 71 | {{- end }} 72 | {{- if $ingressSupportsPathType }} 73 | pathType: {{ $ingressPathType }} 74 | {{- end }} 75 | {{- end -}} 76 | {{- end }} 77 | -------------------------------------------------------------------------------- /charts/kminion/templates/poddisruptionbudget.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.podDisruptionBudget }} 2 | apiVersion: {{ include "kminion.podDisruptionBudget.apiVersion" . }} 3 | kind: PodDisruptionBudget 4 | metadata: 5 | name: {{ template "kminion.fullname" . }} 6 | namespace: {{ .Release.Namespace | quote }} 7 | labels: 8 | {{- include "kminion.labels" . | nindent 4}} 9 | spec: 10 | {{- if .Values.podDisruptionBudget.minAvailable }} 11 | minAvailable: {{ .Values.podDisruptionBudget.minAvailable }} 12 | {{- end }} 13 | {{- if .Values.podDisruptionBudget.maxUnavailable }} 14 | maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable }} 15 | {{- end }} 16 | selector: 17 | matchLabels: 18 | {{- include "kminion.selectorLabels" . | nindent 6}} 19 | {{- end }} 20 | -------------------------------------------------------------------------------- /charts/kminion/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "kminion.fullname" . }} 5 | namespace: {{ .Release.Namespace | quote }} 6 | labels: 7 | {{- include "kminion.labels" . | nindent 4 }} 8 | {{- if .Values.service.annotations }} 9 | annotations: 10 | {{- toYaml .Values.service.annotations | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | type: {{ .Values.service.type }} 14 | ports: 15 | - port: {{ .Values.service.port }} 16 | targetPort: metrics 17 | protocol: TCP 18 | name: metrics 19 | {{- if .Values.service.extraPorts }} 20 | {{- toYaml .Values.service.extraPorts | nindent 4 }} 21 | {{- end }} 22 | selector: 23 | {{- include "kminion.selectorLabels" . | nindent 4 }} 24 | -------------------------------------------------------------------------------- /charts/kminion/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "kminion.serviceAccountName" . }} 6 | namespace: {{ .Release.Namespace | quote }} 7 | labels: 8 | {{- include "kminion.labels" . | nindent 4 }} 9 | {{- with .Values.serviceAccount.annotations }} 10 | annotations: 11 | {{- toYaml . | nindent 4 }} 12 | {{- end }} 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /charts/kminion/templates/servicemonitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceMonitor.create }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | name: {{include "kminion.fullname" .}} 6 | namespace: {{ .Release.Namespace | quote }} 7 | labels: 8 | {{- include "kminion.labels" . | nindent 4}} 9 | {{- if .Values.serviceMonitor.additionalLabels}} 10 | {{toYaml .Values.serviceMonitor.additionalLabels | nindent 4}} 11 | {{- end}} 12 | spec: 13 | selector: 14 | matchLabels: 15 | {{- include "kminion.labels" . 
| nindent 6}} 16 | endpoints: 17 | - port: metrics 18 | path: /metrics 19 | honorLabels: {{ .Values.serviceMonitor.honorLabels }} 20 | scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }} 21 | interval: {{ .Values.serviceMonitor.interval }} 22 | {{- if .Values.serviceMonitor.relabelings }} 23 | relabelings: 24 | {{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }} 25 | {{- end }} 26 | {{- if .Values.serviceMonitor.targetLabels}} 27 | targetLabels: 28 | {{- toYaml .Values.serviceMonitor.targetLabels | nindent 4}} 29 | {{- end}} 30 | {{- if .Values.customLabels }} 31 | podTargetLabels: 32 | {{- (keys .Values.customLabels | sortAlpha) | toYaml | nindent 4 }} 33 | {{- end}} 34 | {{- end }} 35 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strings" 7 | 8 | "github.com/cloudhut/kminion/v2/kafka" 9 | "github.com/cloudhut/kminion/v2/logging" 10 | "github.com/cloudhut/kminion/v2/minion" 11 | "github.com/cloudhut/kminion/v2/prometheus" 12 | "github.com/knadh/koanf" 13 | "github.com/knadh/koanf/parsers/yaml" 14 | "github.com/knadh/koanf/providers/env" 15 | "github.com/knadh/koanf/providers/file" 16 | "github.com/mitchellh/mapstructure" 17 | "go.uber.org/zap" 18 | ) 19 | 20 | type Config struct { 21 | Kafka kafka.Config `koanf:"kafka"` 22 | Minion minion.Config `koanf:"minion"` 23 | Exporter prometheus.Config `koanf:"exporter"` 24 | Logger logging.Config `koanf:"logger"` 25 | } 26 | 27 | func (c *Config) SetDefaults() { 28 | c.Kafka.SetDefaults() 29 | c.Minion.SetDefaults() 30 | c.Exporter.SetDefaults() 31 | c.Logger.SetDefaults() 32 | } 33 | 34 | func (c *Config) Validate() error { 35 | err := c.Kafka.Validate() 36 | if err != nil { 37 | return fmt.Errorf("failed to validate kafka config: %w", err) 38 | } 39 | 40 | err = c.Minion.Validate() 41 | if err != nil { 42 | return fmt.Errorf("failed to validate minion config: %w", err) 43 | } 44 | 45 | err = c.Logger.Validate() 46 | if err != nil { 47 | return fmt.Errorf("failed to validate logger config: %w", err) 48 | } 49 | 50 | return nil 51 | } 52 | 53 | func newConfig(logger *zap.Logger) (Config, error) { 54 | k := koanf.New(".") 55 | var cfg Config 56 | cfg.SetDefaults() 57 | 58 | // 1. Check if a config filepath is set via flags. If there is one we'll try to load the file using a YAML Parser 59 | envKey := "CONFIG_FILEPATH" 60 | configFilepath := os.Getenv(envKey) 61 | if configFilepath == "" { 62 | logger.Info("the env variable '" + envKey + "' is not set, therefore no YAML config will be loaded") 63 | } else { 64 | err := k.Load(file.Provider(configFilepath), yaml.Parser()) 65 | if err != nil { 66 | return Config{}, fmt.Errorf("failed to parse YAML config: %w", err) 67 | } 68 | } 69 | 70 | // We could unmarshal the loaded koanf input after loading both providers, however we want to unmarshal the YAML 71 | // config with `ErrorUnused` set to true, but unmarshal environment variables with `ErrorUnused` set to false (default). 72 | // Rationale: Orchestrators like Kubernetes inject unrelated environment variables, which we still want to allow. 
73 | err := k.UnmarshalWithConf("", &cfg, koanf.UnmarshalConf{ 74 | Tag: "", 75 | FlatPaths: false, 76 | DecoderConfig: &mapstructure.DecoderConfig{ 77 | DecodeHook: mapstructure.ComposeDecodeHookFunc( 78 | mapstructure.StringToTimeDurationHookFunc()), 79 | Metadata: nil, 80 | Result: &cfg, 81 | WeaklyTypedInput: true, 82 | ErrorUnused: true, 83 | }, 84 | }) 85 | if err != nil { 86 | return Config{}, err 87 | } 88 | 89 | err = k.Load(env.ProviderWithValue("", ".", func(s string, v string) (string, interface{}) { 90 | // key := strings.Replace(strings.ToLower(s), "_", ".", -1) 91 | key := strings.Replace(strings.ToLower(s), "_", ".", -1) 92 | // Check to exist if we have a configuration option already and see if it's a slice 93 | // If there is a comma in the value, split the value into a slice by the comma. 94 | if strings.Contains(v, ",") { 95 | return key, strings.Split(v, ",") 96 | } 97 | 98 | // Otherwise return the new key with the unaltered value 99 | return key, v 100 | }), nil) 101 | if err != nil { 102 | return Config{}, err 103 | } 104 | 105 | err = k.Unmarshal("", &cfg) 106 | if err != nil { 107 | return Config{}, err 108 | } 109 | 110 | err = cfg.Validate() 111 | if err != nil { 112 | return Config{}, fmt.Errorf("failed to validate config: %w", err) 113 | } 114 | 115 | return cfg, nil 116 | } 117 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '2.1' 3 | 4 | services: 5 | 6 | zookeeper: 7 | image: confluentinc/cp-zookeeper:latest 8 | ports: 9 | - 2181:2181 10 | environment: 11 | ZOOKEEPER_CLIENT_PORT: 2181 12 | ZOOKEEPER_TICK_TIME: 2000 13 | container_name: zookeeper 14 | hostname: zookeeper 15 | 16 | kafka: 17 | image: confluentinc/cp-kafka:latest 18 | hostname: kafka 19 | container_name: kafka 20 | depends_on: 21 | - zookeeper 22 | ports: 23 | - 9092:9092 24 | environment: 25 | KAFKA_BROKER_ID: 1 26 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 28 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 29 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 30 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 31 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 32 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 33 | 34 | kafka-minion: 35 | build: 36 | context: . 37 | dockerfile: ./Dockerfile 38 | hostname: kafka-minion 39 | container_name: kafka-minion 40 | depends_on: 41 | - zookeeper 42 | - kafka 43 | ports: 44 | - 8080:8080 45 | environment: 46 | KAFKA_BROKERS: kafka:29092 47 | restart: unless-stopped -------------------------------------------------------------------------------- /docs/end-to-end.md: -------------------------------------------------------------------------------- 1 | # End-To-End Monitoring 2 | 3 | This page describes the end-to-end monitoring feature in KMinion, how it works, and what metrics it provides. 4 | 5 | ## Motivation 6 | 7 | > What is the issue? Why did we build this feature? 8 | 9 | We can monitor metrics like CPU usage, free disk space, or even consumer group lag. However, these metrics don't give us 10 | a good idea of the performance characteristics an actual, real-world, client experiences when connected to the cluster. 11 | 12 | With the "classic" metrics lots of questions go unanswered: 13 | 14 | - Can a client produce messages to the cluster? 
15 | - Can clients produce & consume messages as well as commit group offsets with an acceptable latency? 16 | - Is the cluster in a healthy state from a client's perspective? 17 | 18 | ## Approach & Implementation 19 | 20 | > How do we solve those issues? How does the feature work? 21 | 22 | The most reliable way to get real-world performance and availability metrics is to actually run a producer/consumer 23 | ourselves. This is exactly what the end-to-end monitoring feature does! 24 | 25 | ## High Level Overview 26 | 27 | In order to determine if the cluster is fully operational, and its performance is within acceptable limits, KMinion 28 | continuously produces and consumes messages to/from the cluster. That way we can measure things like ack-latency, 29 | commit-latency, and roundtrip-time. 30 | 31 | KMinion creates and manages its own topic for the end-to-end test messages. The name of the topic can be configured. 32 | 33 | **The first step** is to create a message and send it to the cluster. 34 | 35 | - Every produced message is added to an internal tracker, so we can recognize messages being "lost". A message is 36 | considered lost if it doesn't arrive back at the consumer within the configured time span. 37 | 38 | **The second step** is to continuously consume the topic. 39 | 40 | - As each message arrives, we calculate its roundtrip time (time from the point the message was created, until KMinion 41 | received it again). 42 | - Consumer group offsets are committed periodically, while also recording the time each commit takes. 43 | 44 | ### Topic Management 45 | 46 | The topic KMinion uses is created and managed completely automatically (the topic name can be configured though). 47 | 48 | KMinion continuously checks the topic and fixes issues/imbalances automatically: 49 | 50 | - It adds partitions to the topic, so it has at least as many partitions as there are brokers. 51 | - It reassigns partitions to ensure every broker leads at least one partition, and that all partitions' replicas are 52 | distributed evenly across the brokers. KMinion tries to assign partition IDs to brokers that have the same broker ID. 53 | 54 | ### Consumer Group Management 55 | 56 | On startup each KMinion instance generates a unique identifier (UUID) that is used to create its own consumer group. It 57 | incorporates the shared prefix from the config. 58 | 59 | That is necessary because: 60 | 61 | - Offsets must not be shared among multiple instances. 62 | - Each instance must always consume **all** partitions of the topic. 63 | 64 | The instance's UUID is also embedded in every message, so each instance can easily filter out messages it didn't 65 | produce. That's why it is perfectly fine to run multiple KMinion instances against the same cluster, using the same 66 | topic. 67 | 68 | KMinion also monitors and deletes consumer groups that use its configured prefix. That way, when an instance 69 | exits/restarts, previous consumer groups will be cleaned up quickly (groups that stay empty for roughly 20s are deleted). 70 | 71 | ## Available Metrics 72 | 73 | The end-to-end monitoring feature exports the following metrics. 74 | 75 | ### Counters 76 | 77 | | Name | Description | 78 | | --- | --- | 79 | | `kminion_end_to_end_messages_produced_total` | Messages KMinion *tried* to send | 80 | | `kminion_end_to_end_messages_received_total` | Number of messages received (only counts those that match, i.e. that this instance actually produced itself) | 81 | | `kminion_end_to_end_offset_commits_total` | Counts how many times KMinion's end-to-end test has committed offsets | 82 | | `kminion_end_to_end_messages_lost_total` | Number of messages that have been produced successfully but not received within the configured SLA duration | 83 | | `kminion_end_to_end_messages_produced_failed_total` | Number of messages that failed to produce to Kafka because of a timeout or failure | 84 | 85 | 86 | ### Histograms 87 | 88 | | Name | Description | 89 | | --- | --- | 90 | | `kminion_end_to_end_produce_latency_seconds` | Duration until the cluster acknowledged a message. | 91 | | `kminion_end_to_end_offset_commit_latency_seconds` | Time Kafka took to respond to KMinion's offset commit | 92 | | `kminion_end_to_end_roundtrip_latency_seconds` | Duration from creation of a message, until it was received/consumed again. | 93 | 94 | ### Gauges 95 | | Name | Description | 96 | | --- | --- | 97 | | `kminion_end_to_end_messages_produced_in_flight` | Number of messages that KMinion's end-to-end test produced but has not received an answer for yet | 98 | 99 | ## Config Properties 100 | 101 | All config properties related to this feature are located in `minion.endToEnd`. 102 | 103 | ```yaml 104 | endToEnd: 105 | enabled: true 106 | probeInterval: 800ms # how often to send end-to-end test messages 107 | topicManagement: 108 | # You can disable topic management, without disabling the testing feature. 109 | # Only makes sense if you have multiple kminion instances, and for some reason only want one of them to create/configure the topic. 110 | # It is strongly recommended to leave this enabled. 111 | enabled: true 112 | 113 | # Name of the topic kminion uses to send its test messages 114 | # You do *not* need to change this if you are running multiple kminion instances on the same cluster. 115 | # Different instances are perfectly fine with sharing the same topic! 116 | name: kminion-end-to-end 117 | 118 | # How often kminion checks its topic to validate configuration, partition count, and partition assignments 119 | reconciliationInterval: 10m 120 | 121 | # Useful for monitoring the performance of acks (if >1 this is best combined with 'producer.requiredAcks' set to 'all') 122 | replicationFactor: 1 123 | 124 | # Rarely makes sense to change this, but maybe if you want some sort of cheap load test? 125 | partitionsPerBroker: 1 126 | 127 | producer: 128 | # This defines the maximum time to wait for an ack response after producing a message, 129 | # and the upper bound for histogram buckets in "produce_latency_seconds" 130 | ackSla: 5s 131 | # Can be set to "all" (default) so kafka only reports an end-to-end test message as acknowledged if 132 | # the message was written to all in-sync replicas of the partition. 133 | # Or can be set to "leader" to only require the leader to have written the message to its log. 134 | requiredAcks: all 135 | 136 | consumer: 137 | # Prefix kminion uses when creating its consumer groups. Current kminion instance id will be appended automatically 138 | groupIdPrefix: kminion-end-to-end 139 | 140 | # Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want 141 | # KMinion to clean up its old consumer groups. It should only be used if you use a unique prefix for KMinion.
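# Note: as implemented in e2e/group_tracker.go further down in this repository, only groups that match the prefix and
# have stayed in the "Empty" state for a while are deleted, and deletion attempts are disabled once a
# GroupAuthorizationFailed error is encountered.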
142 | deleteStaleConsumerGroups: false 143 | 144 | # Defines the time limit beyond which a message is considered "lost" (failed the roundtrip), 145 | # also used as the upper bound for histogram buckets in "roundtrip_latency" 146 | roundtripSla: 20s 147 | 148 | # Maximum time an offset commit is allowed to take before considering it failed, 149 | # also used as the upper bound for histogram buckets in "commit_latency_seconds" 150 | commitSla: 10s 151 | ``` 152 | 153 | -------------------------------------------------------------------------------- /docs/metrics.md: -------------------------------------------------------------------------------- 1 | # Exported Metrics 2 | 3 | This document lists all exported metrics in an exemplary way. 4 | 5 | ## Exporter Metrics 6 | 7 | ``` 8 | # HELP kminion_exporter_up Build info about this Prometheus Exporter. Gauge value is 0 if one or more scrapes have failed. 9 | # TYPE kminion_exporter_up gauge 10 | kminion_exporter_up{version="sha-0ab0dcdf862f7a34b06998cd2d980148e048151a"} 1 11 | 12 | # HELP kminion_exporter_offset_consumer_records_consumed_total The number of offset records that have been consumed by the internal offset consumer 13 | # TYPE kminion_exporter_offset_consumer_records_consumed_total counter 14 | kminion_exporter_offset_consumer_records_consumed_total 5.058244883e+09 15 | ``` 16 | 17 | ## Kafka Metrics 18 | 19 | ### General / Cluster Metrics 20 | 21 | ``` 22 | # HELP kminion_kafka_broker_info Kafka broker information 23 | # TYPE kminion_kafka_broker_info gauge 24 | kminion_kafka_broker_info{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",is_controller="false",port="9092",rack_id="europe-west1-b"} 1 25 | 26 | # HELP kminion_kafka_cluster_info Kafka cluster information 27 | # TYPE kminion_kafka_cluster_info gauge 28 | kminion_kafka_cluster_info{broker_count="12",cluster_id="UYZJg8bhT_6SxhsdaQZEQ",cluster_version="v2.6",controller_id="6"} 1 29 | ``` 30 | 31 | ### Log Dir Metrics 32 | 33 | ``` 34 | # HELP kminion_kafka_broker_log_dir_size_total_bytes The summed size in bytes of all log dirs for a given broker 35 | # TYPE kminion_kafka_broker_log_dir_size_total_bytes gauge 36 | kminion_kafka_broker_log_dir_size_total_bytes{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",port="9092",rack_id="europe-west1-b"} 8.32654935115e+11 37 | 38 | # HELP kminion_kafka_topic_log_dir_size_total_bytes The summed size in bytes of partitions for a given topic. This includes the used space for replica partitions. 
39 | # TYPE kminion_kafka_topic_log_dir_size_total_bytes gauge 40 | kminion_kafka_topic_log_dir_size_total_bytes{topic_name="__consumer_offsets"} 9.026554258e+09 41 | ``` 42 | 43 | ### Topic & Partition Metrics 44 | 45 | ``` 46 | # HELP kminion_kafka_topic_info Info labels for a given topic 47 | # TYPE kminion_kafka_topic_info gauge 48 | kminion_kafka_topic_info{cleanup_policy="compact",partition_count="1",replication_factor="1",topic_name="_confluent-ksql-default__command_topic"} 1 49 | 50 | # HELP kminion_kafka_topic_partition_low_water_mark Partition Low Water Mark 51 | # TYPE kminion_kafka_topic_partition_low_water_mark gauge 52 | kminion_kafka_topic_partition_low_water_mark{partition_id="0",topic_name="__consumer_offsets"} 0 53 | 54 | # HELP kminion_kafka_topic_low_water_mark_sum Sum of all the topic's partition low water marks 55 | # TYPE kminion_kafka_topic_low_water_mark_sum gauge 56 | kminion_kafka_topic_low_water_mark_sum{topic_name="__consumer_offsets"} 0 57 | 58 | # HELP kminion_kafka_topic_partition_high_water_mark Partition High Water Mark 59 | # TYPE kminion_kafka_topic_partition_high_water_mark gauge 60 | kminion_kafka_topic_partition_high_water_mark{partition_id="0",topic_name="__consumer_offsets"} 2.04952001e+08 61 | 62 | # HELP kminion_kafka_topic_high_water_mark_sum Sum of all the topic's partition high water marks 63 | # TYPE kminion_kafka_topic_high_water_mark_sum gauge 64 | kminion_kafka_topic_high_water_mark_sum{topic_name="__consumer_offsets"} 1.512023846873e+12 65 | ``` 66 | 67 | ### Consumer Group Metrics 68 | 69 | ``` 70 | # HELP kminion_kafka_consumer_group_info Consumer Group info metrics. It will report 1 if the group is in the stable state, otherwise 0. 71 | # TYPE kminion_kafka_consumer_group_info gauge 72 | kminion_kafka_consumer_group_info{coordinator_id="0",group_id="bigquery-sink",protocol="range",protocol_type="consumer",state="Stable"} 1 73 | 74 | # HELP kminion_kafka_consumer_group_members Consumer Group member count metrics. It will report the number of members in the consumer group 75 | # TYPE kminion_kafka_consumer_group_members gauge 76 | kminion_kafka_consumer_group_members{group_id="bigquery-sink"} 2 77 | 78 | # HELP kminion_kafka_consumer_group_empty_members Consumer Group Empty Members. It will report the number of members in the consumer group with no partition assigned 79 | # TYPE kminion_kafka_consumer_group_empty_members gauge 80 | kminion_kafka_consumer_group_empty_members{group_id="bigquery-sink"} 1 81 | 82 | # HELP kminion_kafka_consumer_group_topic_members Consumer Group topic member count metrics. It will report the number of members in the consumer group assigned on a given topic 83 | # TYPE kminion_kafka_consumer_group_topic_members gauge 84 | kminion_kafka_consumer_group_topic_members{group_id="bigquery-sink",topic_name="shop-activity"} 4 85 | 86 | # HELP kminion_kafka_consumer_group_topic_assigned_partitions Consumer Group topic partitions count metrics. 
It will report the number of partitions assigned in the consumer group for a given topic 87 | # TYPE kminion_kafka_consumer_group_topic_assigned_partitions gauge 88 | kminion_kafka_consumer_group_topic_assigned_partitions{group_id="bigquery-sink",topic_name="shop-activity"} 32 89 | 90 | # HELP kminion_kafka_consumer_group_topic_offset_sum The sum of all committed group offsets across all partitions in a topic 91 | # TYPE kminion_kafka_consumer_group_topic_offset_sum gauge 92 | kminion_kafka_consumer_group_topic_offset_sum{group_id="bigquery-sink",topic_name="shop-activity"} 4.259513e+06 93 | 94 | # HELP kminion_kafka_consumer_group_topic_partition_lag The number of messages a consumer group is lagging behind the latest offset of a partition 95 | # TYPE kminion_kafka_consumer_group_topic_partition_lag gauge 96 | kminion_kafka_consumer_group_topic_partition_lag{group_id="bigquery-sink",partition_id="10",topic_name="shop-activity"} 147481 97 | 98 | # HELP kminion_kafka_consumer_group_topic_lag The number of messages a consumer group is lagging behind across all partitions in a topic 99 | # TYPE kminion_kafka_consumer_group_topic_lag gauge 100 | kminion_kafka_consumer_group_topic_lag{group_id="bigquery-sink",topic_name="shop-activity"} 147481 101 | 102 | # HELP kminion_kafka_consumer_group_offset_commits_total The number of offsets committed by a group 103 | # TYPE kminion_kafka_consumer_group_offset_commits_total counter 104 | kminion_kafka_consumer_group_offset_commits_total{group_id="bigquery-sink"} 1098 105 | ``` 106 | 107 | ### End-to-End Metrics 108 | 109 | ``` 110 | # HELP kminion_end_to_end_messages_produced_total Number of messages that kminion's end-to-end test has tried to send to kafka 111 | # TYPE kminion_end_to_end_messages_produced_total counter 112 | kminion_end_to_end_messages_produced_total 384 113 | 114 | # HELP kminion_end_to_end_offset_commits_total Counts how many times kminions end-to-end test has committed messages 115 | # TYPE kminion_end_to_end_offset_commits_total counter 116 | kminion_end_to_end_offset_commits_total 18 117 | 118 | # HELP kminion_end_to_end_messages_received_total Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. 
Kminion only considers a message a match if it it arrives within the configured roundtrip SLA (and it matches the minionID) 119 | # TYPE kminion_end_to_end_messages_received_total counter 120 | kminion_end_to_end_messages_received_total 383 121 | 122 | # HELP kminion_end_to_end_produce_latency_seconds Time until we received an ack for a produced message 123 | # TYPE kminion_end_to_end_produce_latency_seconds histogram 124 | kminion_end_to_end_produce_latency_seconds_bucket{partitionId="0",le="0.005"} 0 125 | 126 | # HELP kminion_end_to_end_offset_commit_latency_seconds Time kafka took to respond to kminion's offset commit 127 | # TYPE kminion_end_to_end_offset_commit_latency_seconds histogram 128 | kminion_end_to_end_offset_commit_latency_seconds_bucket{groupCoordinatorBrokerId="0",le="0.005"} 0 129 | 130 | # HELP kminion_end_to_end_roundtrip_latency_seconds Time it took between sending (producing) and receiving (consuming) a message 131 | # TYPE kminion_end_to_end_roundtrip_latency_seconds histogram 132 | kminion_end_to_end_roundtrip_latency_seconds_bucket{partitionId="0",le="0.005"} 0 133 | 134 | # HELP kminion_end_to_end_messages_lost_total Number of messages that have been produced successfully but not received within the configured SLA duration 135 | # TYPE kminion_end_to_end_messages_lost_total counter 136 | kminion_end_to_end_messages_lost_total{partition_id="0"} 0 137 | 138 | # HELP kminion_end_to_end_messages_produced_failed_total Number of messages failed to produce to Kafka because of a timeout or failure 139 | # TYPE kminion_end_to_end_messages_produced_failed_total counter 140 | kminion_end_to_end_messages_produced_failed_total{partition_id="0"} 0 141 | 142 | # HELP kminion_end_to_end_messages_produced_in_flight Number of messages that kminion's end-to-end test produced but has not received an answer for yet 143 | # TYPE kminion_end_to_end_messages_produced_in_flight gauge 144 | kminion_end_to_end_messages_produced_in_flight{partition_id="0"} 0 145 | ``` 146 | -------------------------------------------------------------------------------- /docs/reference-config.yaml: -------------------------------------------------------------------------------- 1 | ##################################################################################### 2 | # This file documents all the available config options and it's default values. 3 | # 4 | # All config options can be configured via environment variables as well. 5 | # If you specify both the env variable and yaml option for the same configuration 6 | # the environment variable will take precedence. If you want to use a YAML config 7 | # file, specify the path to the config file by setting the env variable 8 | # CONFIG_FILEPATH. 9 | # 10 | # The env variable name is auto generated by upper casing everything and adding 11 | # an underscore for each indentation/level. 
Some examples: 12 | # kafka.rackId => KAFKA_RACKID 13 | # kafka.tls.caFilepath => KAFKA_TLS_CAFILEPATH 14 | # minion.consumerGroups.allowedGroups => MINION_CONSUMERGROUPS_ALLOWEDGROUPS 15 | # 16 | # Env variables that expect array values can be provided by separting them using 17 | # a comma: KAFKA_BROKERS = "broker1:9092,broker2:9092,broker3:9092" 18 | ##################################################################################### 19 | 20 | logger: 21 | # Valid values are: debug, info, warn, error, fatal, panic 22 | level: info 23 | 24 | kafka: 25 | brokers: [ ] 26 | clientId: "kminion" 27 | rackId: "" 28 | tls: 29 | enabled: false 30 | caFilepath: "" 31 | certFilepath: "" 32 | keyFilepath: "" 33 | # base64 encoded tls CA, cannot be set if 'caFilepath' is set 34 | ca: "" 35 | # base64 encoded tls cert, cannot be set if 'certFilepath' is set 36 | cert: "" 37 | # base64 encoded tls key, cannot be set if 'keyFilepath' is set 38 | key: "" 39 | passphrase: "" 40 | insecureSkipTlsVerify: false 41 | 42 | sasl: 43 | # Whether or not SASL authentication will be used for authentication 44 | enabled: false 45 | # Username to use for PLAIN or SCRAM mechanism 46 | username: "" 47 | # Password to use for PLAIN or SCRAM mechanism 48 | password: "" 49 | # Mechanism to use for SASL Authentication. Valid values are PLAIN, SCRAM-SHA-256, SCRAM-SHA-512, GSSAPI, OAUTHBEARER 50 | mechanism: "PLAIN" 51 | # GSSAPI / Kerberos config properties 52 | gssapi: 53 | authType: "" 54 | keyTabPath: "" 55 | kerberosConfigPath: "" 56 | serviceName: "" 57 | username: "" 58 | password: "" 59 | realm: "" 60 | enableFast: true 61 | # OAUTHBEARER config properties 62 | oauth: 63 | tokenEndpoint: "" 64 | clientId: "" 65 | clientSecret: "" 66 | scope: "" 67 | 68 | minion: 69 | consumerGroups: 70 | # Enabled specifies whether consumer groups shall be scraped and exported or not. 71 | enabled: true 72 | # Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal 73 | # __consumer_offsets topic. Both modes have their advantages and disadvantages. 74 | # * adminApi: 75 | # - Useful for managed kafka clusters that do not provide access to the offsets topic. 76 | # * offsetsTopic 77 | # - Enables kminion_kafka_consumer_group_offset_commits_total metrics. 78 | # - Processing the offsetsTopic requires slightly more memory and cpu than using the adminApi. The amount depends on the 79 | # size and throughput of the offsets topic. 80 | scrapeMode: adminApi # Valid values: adminApi, offsetsTopic 81 | # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and 82 | # you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed 83 | # and only topic lags will be exported. 84 | granularity: partition 85 | # AllowedGroups are regex strings of group ids that shall be exported 86 | # You can specify allowed groups by providing literals like "my-consumergroup-name" or by providing regex expressions 87 | # like "/internal-.*/". 88 | allowedGroups: [ ".*" ] 89 | # IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups 90 | # take precedence over allowed groups. 91 | ignoredGroups: [ ] 92 | topics: 93 | # Enabled can be set to false in order to disable collecting any topic metrics. 94 | enabled: true 95 | # Granularity can be per topic or per partition. 
If you want to reduce the number of exported metric series and 96 | # you aren't interested in per partition metrics you could choose "topic". 97 | granularity: partition 98 | # AllowedTopics are regex strings of topic names whose topic metrics that shall be exported. 99 | # You can specify allowed topics by providing literals like "my-topic-name" or by providing regex expressions 100 | # like "/internal-.*/". 101 | allowedTopics: [ ".*" ] 102 | # IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics 103 | # take precedence over allowed topics. 104 | ignoredTopics: [ ] 105 | # infoMetric is a configuration object for the kminion_kafka_topic_info metric 106 | infoMetric: 107 | # ConfigKeys are set of strings of Topic configs that you want to have exported as part of the metric 108 | configKeys: [ "cleanup.policy" ] 109 | logDirs: 110 | # Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior 111 | # to version 1.0.0 as describing log dirs was not supported back then. 112 | enabled: true 113 | 114 | # EndToEnd Metrics 115 | # When enabled, kminion creates a topic which it produces to and consumes from, to measure various advanced metrics. See docs for more info 116 | endToEnd: 117 | enabled: false 118 | # How often to send end-to-end test messages 119 | probeInterval: 100ms 120 | topicManagement: 121 | # You can disable topic management, without disabling the testing feature. 122 | # Only makes sense if you have multiple kminion instances, and for some reason only want one of them to create/configure the topic 123 | enabled: true 124 | 125 | # Name of the topic kminion uses to send its test messages 126 | # You do *not* need to change this if you are running multiple kminion instances on the same cluster. 127 | # Different instances are perfectly fine with sharing the same topic! 128 | name: kminion-end-to-end 129 | 130 | # How often kminion checks its topic to validate configuration, partition count, and partition assignments 131 | reconciliationInterval: 10m 132 | 133 | # Depending on the desired monitoring (e.g. you want to alert on broker failure vs. cluster that is not writable) 134 | # you may choose replication factor 1 or 3 most commonly. 135 | replicationFactor: 1 136 | 137 | # Rarely makes sense to change this, but maybe if you want some sort of cheap load test? 138 | # By default (1) every broker gets one partition 139 | partitionsPerBroker: 1 140 | 141 | producer: 142 | # This defines: 143 | # - Maximum time to wait for an ack response after producing a message 144 | # - Upper bound for histogram buckets in "produce_latency_seconds" 145 | ackSla: 5s 146 | # Can be to "all" (default) so kafka only reports an end-to-end test message as acknowledged if 147 | # the message was written to all in-sync replicas of the partition. 148 | # Or can be set to "leader" to only require to have written the message to its log. 149 | requiredAcks: all 150 | 151 | consumer: 152 | # Prefix kminion uses when creating its consumer groups. Current kminion instance id will be appended automatically 153 | groupIdPrefix: kminion-end-to-end 154 | 155 | # Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want 156 | # KMinion to cleanup it's old consumer groups. It should only be used if you use a unique prefix for KMinion. 
157 | deleteStaleConsumerGroups: false 158 | 159 | # This defines: 160 | # - Upper bound for histogram buckets in "roundtrip_latency" 161 | # - Time limit beyond which a message is considered "lost" (failed the roundtrip) 162 | roundtripSla: 20s 163 | 164 | # - Upper bound for histogram buckets in "commit_latency_seconds" 165 | # - Maximum time an offset commit is allowed to take before considering it failed 166 | commitSla: 10s 167 | 168 | exporter: 169 | # Namespace is the prefix for all exported Prometheus metrics 170 | namespace: "kminion" 171 | # Host that shall be used to bind the HTTP server on 172 | host: "" 173 | # Port that shall be used to bind the HTTP server on 174 | port: 8080 175 | -------------------------------------------------------------------------------- /docs/screenshots/kminion-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/docs/screenshots/kminion-cluster.png -------------------------------------------------------------------------------- /docs/screenshots/kminion-groups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/docs/screenshots/kminion-groups.png -------------------------------------------------------------------------------- /docs/screenshots/kminion-topics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/kminion/ea8e76a9b2f2f2e8888bc239261f79d1a33d3967/docs/screenshots/kminion-topics.png -------------------------------------------------------------------------------- /e2e/client_hooks.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "net" 5 | "sync/atomic" 6 | "time" 7 | 8 | "github.com/twmb/franz-go/pkg/kgo" 9 | "github.com/twmb/franz-go/pkg/kmsg" 10 | "go.uber.org/zap" 11 | ) 12 | 13 | // in e2e we only use client hooks for logging connect/disconnect messages 14 | type clientHooks struct { 15 | logger *zap.Logger 16 | 17 | lastCoordinatorUpdate time.Time 18 | currentCoordinator *atomic.Value // kgo.BrokerMetadata 19 | } 20 | 21 | func newEndToEndClientHooks(logger *zap.Logger) *clientHooks { 22 | return &clientHooks{ 23 | logger: logger.Named("e2e_hooks"), 24 | currentCoordinator: &atomic.Value{}, 25 | } 26 | } 27 | 28 | func (c *clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur time.Duration, _ net.Conn, err error) { 29 | if err != nil { 30 | c.logger.Error("kafka connection failed", zap.String("broker_host", meta.Host), zap.Int32("broker_id", meta.NodeID), zap.Error(err)) 31 | return 32 | } 33 | c.logger.Debug("kafka connection succeeded", 34 | zap.String("host", meta.Host), zap.Int32("broker_id", meta.NodeID), 35 | zap.Int64("dial_duration_ms", dialDur.Milliseconds())) 36 | } 37 | 38 | func (c *clientHooks) OnDisconnect(meta kgo.BrokerMetadata, _ net.Conn) { 39 | c.logger.Warn("kafka broker disconnected", zap.Int32("broker_id", meta.NodeID), 40 | zap.String("host", meta.Host)) 41 | } 42 | 43 | // OnBrokerWrite is passed the broker metadata, the key for the request that 44 | // was written, the number of bytes written, how long the request 45 | // waited before being written, how long it took to write the request, 46 | // and any error. 47 | // 48 | // The bytes written does not count any tls overhead. 
49 | // OnWrite is called after a write to a broker. 50 | // 51 | // OnWrite(meta BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error) 52 | func (c *clientHooks) OnBrokerWrite(meta kgo.BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error) { 53 | keyName := kmsg.NameForKey(key) 54 | if keyName != "OffsetCommit" { 55 | return 56 | } 57 | 58 | // c.logger.Info("hooks onWrite", 59 | // zap.Duration("timeToWrite", timeToWrite), 60 | // zap.NamedError("err", err)) 61 | } 62 | 63 | // OnBrokerRead is passed the broker metadata, the key for the response that 64 | // was read, the number of bytes read, how long the Client waited 65 | // before reading the response, how long it took to read the response, 66 | // and any error. 67 | // 68 | // The bytes written does not count any tls overhead. 69 | // OnRead is called after a read from a broker. 70 | // OnRead(meta BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error) 71 | func (c *clientHooks) OnBrokerRead(meta kgo.BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error) { 72 | consumerGroupMsgKeys := []int16{ 73 | (&kmsg.OffsetCommitResponse{}).Key(), 74 | (&kmsg.JoinGroupResponse{}).Key(), 75 | (&kmsg.HeartbeatResponse{}).Key(), 76 | (&kmsg.SyncGroupResponse{}).Key(), 77 | } 78 | 79 | isMessageFromGroupCoordinator := isInArray(key, consumerGroupMsgKeys) 80 | if !isMessageFromGroupCoordinator { 81 | return 82 | } 83 | 84 | if err == nil { 85 | c.currentCoordinator.Store(meta) 86 | c.lastCoordinatorUpdate = time.Now() 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /e2e/config.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | type Config struct { 9 | Enabled bool `koanf:"enabled"` 10 | TopicManagement EndToEndTopicConfig `koanf:"topicManagement"` 11 | ProbeInterval time.Duration `koanf:"probeInterval"` 12 | Producer EndToEndProducerConfig `koanf:"producer"` 13 | Consumer EndToEndConsumerConfig `koanf:"consumer"` 14 | } 15 | 16 | func (c *Config) SetDefaults() { 17 | c.Enabled = false 18 | c.ProbeInterval = 100 * time.Millisecond 19 | c.TopicManagement.SetDefaults() 20 | c.Producer.SetDefaults() 21 | c.Consumer.SetDefaults() 22 | } 23 | 24 | func (c *Config) Validate() error { 25 | 26 | if !c.Enabled { 27 | return nil 28 | } 29 | 30 | // If the timeduration is 0s or 0ms or its variation of zero, it will be parsed as 0 31 | if c.ProbeInterval == 0 { 32 | return fmt.Errorf("failed to validate probeInterval config, the duration can't be zero") 33 | } 34 | 35 | err := c.TopicManagement.Validate() 36 | if err != nil { 37 | return fmt.Errorf("failed to validate topicManagement config: %w", err) 38 | } 39 | 40 | _, err = time.ParseDuration(c.ProbeInterval.String()) 41 | if err != nil { 42 | return fmt.Errorf("failed to parse '%s' to time.Duration: %v", c.ProbeInterval.String(), err) 43 | } 44 | 45 | err = c.Producer.Validate() 46 | if err != nil { 47 | return fmt.Errorf("failed to validate producer config: %w", err) 48 | } 49 | 50 | err = c.Consumer.Validate() 51 | if err != nil { 52 | return fmt.Errorf("failed to validate consumer config: %w", err) 53 | } 54 | 55 | return nil 56 | } 57 | -------------------------------------------------------------------------------- /e2e/config_consumer.go: 
-------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | type EndToEndConsumerConfig struct { 9 | GroupIdPrefix string `koanf:"groupIdPrefix"` 10 | DeleteStaleConsumerGroups bool `koanf:"deleteStaleConsumerGroups"` 11 | 12 | // RoundtripSLA is the time duration from the moment where we try to produce until the moment where we consumed 13 | // the message. Therefore this should always be higher than the produceTimeout / SLA. 14 | RoundtripSla time.Duration `koanf:"roundtripSla"` 15 | CommitSla time.Duration `koanf:"commitSla"` 16 | } 17 | 18 | func (c *EndToEndConsumerConfig) SetDefaults() { 19 | c.GroupIdPrefix = "kminion-end-to-end" 20 | c.DeleteStaleConsumerGroups = false 21 | c.RoundtripSla = 20 * time.Second 22 | c.CommitSla = 5 * time.Second 23 | } 24 | 25 | func (c *EndToEndConsumerConfig) Validate() error { 26 | if len(c.GroupIdPrefix) < 3 { 27 | return fmt.Errorf("kminion prefix should be at least 3 characters long") 28 | } 29 | 30 | if c.RoundtripSla <= 0 { 31 | return fmt.Errorf("consumer.roundtripSla must be greater than zero") 32 | } 33 | 34 | if c.CommitSla <= 0 { 35 | return fmt.Errorf("consumer.commitSla must be greater than zero") 36 | } 37 | 38 | return nil 39 | } 40 | -------------------------------------------------------------------------------- /e2e/config_producer.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | type EndToEndProducerConfig struct { 9 | AckSla time.Duration `koanf:"ackSla"` 10 | RequiredAcks string `koanf:"requiredAcks"` 11 | } 12 | 13 | func (c *EndToEndProducerConfig) SetDefaults() { 14 | c.AckSla = 5 * time.Second 15 | c.RequiredAcks = "all" 16 | } 17 | 18 | func (c *EndToEndProducerConfig) Validate() error { 19 | 20 | if c.RequiredAcks != "all" && c.RequiredAcks != "leader" { 21 | return fmt.Errorf("producer.requiredAcks must be 'all' or 'leader") 22 | } 23 | 24 | if c.AckSla <= 0 { 25 | return fmt.Errorf("producer.ackSla must be greater than zero") 26 | } 27 | 28 | return nil 29 | } 30 | -------------------------------------------------------------------------------- /e2e/config_topic.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | type EndToEndTopicConfig struct { 9 | Enabled bool `koanf:"enabled"` 10 | Name string `koanf:"name"` 11 | ReplicationFactor int `koanf:"replicationFactor"` 12 | PartitionsPerBroker int `koanf:"partitionsPerBroker"` 13 | ReconciliationInterval time.Duration `koanf:"reconciliationInterval"` 14 | } 15 | 16 | func (c *EndToEndTopicConfig) SetDefaults() { 17 | c.Enabled = true 18 | c.Name = "kminion-end-to-end" 19 | c.ReplicationFactor = 1 20 | c.PartitionsPerBroker = 1 21 | c.ReconciliationInterval = 10 * time.Minute 22 | } 23 | 24 | func (c *EndToEndTopicConfig) Validate() error { 25 | 26 | if c.ReplicationFactor < 1 { 27 | return fmt.Errorf("failed to parse replicationFactor, it should be more than 1, retrieved value %v", c.ReplicationFactor) 28 | } 29 | 30 | if c.PartitionsPerBroker < 1 { 31 | return fmt.Errorf("failed to parse partitionsPerBroker, it should be more than 1, retrieved value %v", c.ReplicationFactor) 32 | } 33 | 34 | // If the timeduration is 0s or 0ms or its variation of zero, it will be parsed as 0 35 | if c.ReconciliationInterval == 0 { 36 | return fmt.Errorf("failed to validate topic.ReconciliationInterval 
config, the duration can't be zero") 37 | } 38 | 39 | return nil 40 | } 41 | -------------------------------------------------------------------------------- /e2e/consumer.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/twmb/franz-go/pkg/kgo" 10 | "github.com/twmb/franz-go/pkg/kmsg" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | func (s *Service) startConsumeMessages(ctx context.Context, initializedCh chan<- bool) { 15 | client := s.client 16 | 17 | s.logger.Info("starting to consume end-to-end topic", 18 | zap.String("topic_name", s.config.TopicManagement.Name), 19 | zap.String("group_id", s.groupId)) 20 | 21 | isInitialized := false 22 | for { 23 | fetches := client.PollFetches(ctx) 24 | if !isInitialized { 25 | isInitialized = true 26 | initializedCh <- true 27 | close(initializedCh) 28 | } 29 | 30 | // Log all errors and continue afterwards as we might get errors and still have some fetch results 31 | errors := fetches.Errors() 32 | for _, err := range errors { 33 | s.logger.Error("kafka fetch error", 34 | zap.String("topic", err.Topic), 35 | zap.Int32("partition", err.Partition), 36 | zap.Error(err.Err)) 37 | } 38 | 39 | fetches.EachRecord(s.processMessage) 40 | } 41 | } 42 | 43 | func (s *Service) commitOffsets(ctx context.Context) { 44 | client := s.client 45 | uncommittedOffset := client.UncommittedOffsets() 46 | if uncommittedOffset == nil { 47 | return 48 | } 49 | 50 | startCommitTimestamp := time.Now() 51 | 52 | childCtx, cancel := context.WithTimeout(ctx, s.config.Consumer.CommitSla) 53 | client.CommitOffsets(childCtx, uncommittedOffset, func(_ *kgo.Client, req *kmsg.OffsetCommitRequest, r *kmsg.OffsetCommitResponse, err error) { 54 | cancel() 55 | 56 | coordinator := s.clientHooks.currentCoordinator.Load().(kgo.BrokerMetadata) 57 | coordinatorID := strconv.Itoa(int(coordinator.NodeID)) 58 | 59 | latency := time.Since(startCommitTimestamp) 60 | s.offsetCommitLatency.WithLabelValues(coordinatorID).Observe(latency.Seconds()) 61 | s.offsetCommitsTotal.WithLabelValues(coordinatorID).Inc() 62 | // We do this to ensure that a series with that coordinator id is initialized 63 | s.offsetCommitsTotal.WithLabelValues(coordinatorID).Add(0) 64 | 65 | // If we have at least one error in our commit response we want to report it as an error with an appropriate 66 | // reason as label. 67 | if errCode := s.logCommitErrors(r, err); errCode != "" { 68 | s.offsetCommitsFailedTotal.WithLabelValues(coordinatorID, errCode).Inc() 69 | return 70 | } 71 | }) 72 | } 73 | 74 | // processMessage: 75 | // - deserializes the message 76 | // - checks if it is from us, or from another kminion process running somewhere else 77 | // - hands it off to the service, which then reports metrics on it 78 | func (s *Service) processMessage(record *kgo.Record) { 79 | if record.Value == nil { 80 | // Init messages have nil values - we want to skip these. They are only used to make sure a consumer is ready. 
81 | return 82 | } 83 | 84 | var msg EndToEndMessage 85 | if jerr := json.Unmarshal(record.Value, &msg); jerr != nil { 86 | s.logger.Error("failed to unmarshal message value", zap.Error(jerr)) 87 | return // maybe older version 88 | } 89 | 90 | if msg.MinionID != s.minionID { 91 | return // not from us 92 | } 93 | 94 | // restore partition, which is not serialized 95 | msg.partition = int(record.Partition) 96 | s.messageTracker.onMessageArrived(&msg) 97 | } 98 | -------------------------------------------------------------------------------- /e2e/endtoend_message.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import "time" 4 | 5 | const ( 6 | _ = iota 7 | EndToEndMessageStateCreated 8 | EndToEndMessageStateProducedSuccessfully 9 | ) 10 | 11 | type EndToEndMessage struct { 12 | MinionID string `json:"minionID"` // unique for each running kminion instance 13 | MessageID string `json:"messageID"` // unique for each message 14 | Timestamp int64 `json:"createdUtcNs"` // when the message was created, unix nanoseconds 15 | 16 | // The following properties are only used within the message tracker 17 | partition int 18 | state int 19 | produceLatency float64 20 | } 21 | 22 | func (m *EndToEndMessage) creationTime() time.Time { 23 | return time.Unix(0, m.Timestamp) 24 | } 25 | -------------------------------------------------------------------------------- /e2e/group_tracker.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "strings" 6 | "time" 7 | 8 | "github.com/twmb/franz-go/pkg/kerr" 9 | "github.com/twmb/franz-go/pkg/kgo" 10 | "github.com/twmb/franz-go/pkg/kmsg" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | const ( 15 | oldGroupCheckInterval = 5 * time.Second // how often to check for old kminion groups 16 | oldGroupMaxAge = 20 * time.Second // maximum age after which an old group should be deleted 17 | ) 18 | 19 | // groupTracker keeps checking for empty consumerGroups matching the kminion prefix. 20 | // When a group was seen empty for some time, we delete it. 21 | // Why? 22 | // Whenever a kminion instance starts up it creates a consumer-group for itself in order to not "collide" with other kminion instances. 23 | // When an instance restarts (for whatever reason), it creates a new group again, so we'd end up with a lot of unused groups. 
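// In short: every oldGroupCheckInterval the tracker lists consumer groups that are in the "Empty" state, remembers
// when it first saw each group matching the configured prefix (skipping its own group id), and deletes any group that
// has stayed empty for longer than oldGroupMaxAge.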
24 | type groupTracker struct { 25 | cfg Config 26 | logger *zap.Logger 27 | client *kgo.Client // kafka client 28 | groupId string // our own groupId 29 | potentiallyEmptyGroups map[string]time.Time // groupName -> utc timestamp when the group was first seen 30 | } 31 | 32 | func newGroupTracker(cfg Config, logger *zap.Logger, client *kgo.Client, groupID string) *groupTracker { 33 | return &groupTracker{ 34 | cfg: cfg, 35 | logger: logger.Named("group_tracker"), 36 | client: client, 37 | groupId: groupID, 38 | potentiallyEmptyGroups: make(map[string]time.Time), 39 | } 40 | } 41 | 42 | func (g *groupTracker) start(ctx context.Context) { 43 | g.logger.Debug("starting group tracker") 44 | 45 | deleteOldGroupsTicker := time.NewTicker(oldGroupCheckInterval) 46 | for { 47 | select { 48 | case <-ctx.Done(): 49 | g.logger.Debug("stopping group tracker, context was cancelled") 50 | return 51 | case <-deleteOldGroupsTicker.C: 52 | childCtx, cancel := context.WithTimeout(ctx, 10*time.Second) 53 | err := g.checkAndDeleteOldConsumerGroups(childCtx) 54 | if err != nil { 55 | g.logger.Error("failed to check for old consumer groups: %w", zap.Error(err)) 56 | } 57 | cancel() 58 | } 59 | } 60 | } 61 | 62 | func (g *groupTracker) checkAndDeleteOldConsumerGroups(ctx context.Context) error { 63 | groupsRq := kmsg.NewListGroupsRequest() 64 | groupsRq.StatesFilter = []string{"Empty"} 65 | 66 | g.logger.Debug("checking for stale kminion consumer groups") 67 | 68 | shardedResponse := g.client.RequestSharded(ctx, &groupsRq) 69 | 70 | // find groups that start with the kminion prefix 71 | matchingGroups := make([]string, 0) 72 | for _, shard := range shardedResponse { 73 | if shard.Err != nil { 74 | g.logger.Error("error in response to ListGroupsRequest", zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err)) 75 | continue 76 | } 77 | 78 | r, ok := shard.Resp.(*kmsg.ListGroupsResponse) 79 | if !ok { 80 | g.logger.Error("cannot cast responseShard.Resp to kmsg.ListGroupsResponse") 81 | continue 82 | } 83 | 84 | for _, group := range r.Groups { 85 | name := group.Group 86 | 87 | if name == g.groupId { 88 | continue // skip our own consumer group 89 | } 90 | 91 | if strings.HasPrefix(name, g.cfg.Consumer.GroupIdPrefix) { 92 | matchingGroups = append(matchingGroups, name) 93 | } 94 | } 95 | } 96 | 97 | // save new (previously unseen) groups to tracker 98 | g.logger.Debug("checked for stale consumer groups", zap.Int("found_groups", len(matchingGroups)), zap.Strings("groups", matchingGroups)) 99 | for _, name := range matchingGroups { 100 | _, exists := g.potentiallyEmptyGroups[name] 101 | if !exists { 102 | // add it with the current timestamp 103 | g.potentiallyEmptyGroups[name] = time.Now() 104 | g.logger.Debug("found new empty kminion group, adding it to the tracker", zap.String("group", name)) 105 | } 106 | } 107 | 108 | // go through saved groups: 109 | // - don't track the ones we don't see anymore (bc they got deleted or are not empty anymore) 110 | // - mark the ones that are too old (have been observed as empty for too long) 111 | groupsToDelete := make([]string, 0) 112 | for name, firstSeen := range g.potentiallyEmptyGroups { 113 | exists, _ := containsStr(matchingGroups, name) 114 | if exists { 115 | // still there, check age and maybe delete it 116 | age := time.Since(firstSeen) 117 | if age > oldGroupMaxAge { 118 | // group was unused for too long, delete it 119 | groupsToDelete = append(groupsToDelete, name) 120 | delete(g.potentiallyEmptyGroups, name) 121 | } 122 | } else { 123 | // does not 
exist anymore, it must have been deleted, or is in use now (no longer empty) 124 | // don't track it anymore 125 | delete(g.potentiallyEmptyGroups, name) 126 | } 127 | } 128 | 129 | // actually delete the groups we've decided to delete 130 | if len(groupsToDelete) == 0 { 131 | return nil 132 | } 133 | 134 | deleteRq := kmsg.NewDeleteGroupsRequest() 135 | deleteRq.Groups = groupsToDelete 136 | deleteResp := g.client.RequestSharded(ctx, &deleteRq) 137 | 138 | // done, now just errors 139 | // if we get a not authorized error we'll disable deleting groups 140 | foundNotAuthorizedError := false 141 | deletedGroups := make([]string, 0) 142 | for _, shard := range deleteResp { 143 | if shard.Err != nil { 144 | g.logger.Error("sharded consumer group delete request failed", zap.Error(shard.Err)) 145 | continue 146 | } 147 | 148 | resp, ok := shard.Resp.(*kmsg.DeleteGroupsResponse) 149 | if !ok { 150 | g.logger.Error("failed to cast shard response to DeleteGroupsResponse while handling an error for deleting groups", zap.String("shard_host", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err)) 151 | continue 152 | } 153 | 154 | for _, groupResp := range resp.Groups { 155 | err := kerr.ErrorForCode(groupResp.ErrorCode) 156 | if err != nil { 157 | g.logger.Error("failed to delete consumer group", zap.String("shard", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.String("group", groupResp.Group), zap.Error(err)) 158 | 159 | if groupResp.ErrorCode == kerr.GroupAuthorizationFailed.Code { 160 | foundNotAuthorizedError = true 161 | } 162 | 163 | } else { 164 | deletedGroups = append(deletedGroups, groupResp.Group) 165 | } 166 | } 167 | } 168 | g.logger.Info("deleted old consumer groups", zap.Strings("deleted_groups", deletedGroups)) 169 | 170 | if foundNotAuthorizedError { 171 | g.logger.Info("disabling trying to delete old kminion consumer-groups since one of the last delete results had an 'GroupAuthorizationFailed' error") 172 | } 173 | 174 | return nil 175 | } 176 | -------------------------------------------------------------------------------- /e2e/message_tracker.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | "time" 7 | 8 | "github.com/jellydator/ttlcache/v2" 9 | 10 | "go.uber.org/zap" 11 | ) 12 | 13 | // messageTracker keeps track of the messages' lifetime 14 | // 15 | // When we successfully send a mesasge, it will be added to this tracker. 16 | // Later, when we receive the message back in the consumer, the message is marked as completed and removed from the tracker. 17 | // If the message does not arrive within the configured `consumer.roundtripSla`, it is counted as lost. Messages that 18 | // failed to be produced will not be 19 | // considered as lost message. 20 | // 21 | // We use a dedicated counter to track messages that couldn't be produced to Kafka. 
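// Implementation note: the tracker is backed by a ttlcache keyed by MessageID whose TTL equals consumer.roundtripSla.
// addToTracker stores a freshly produced message, onMessageArrived removes it again and records the roundtrip latency,
// and the cache's expiration callback (onMessageExpired) counts every entry that outlives its TTL as lost.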
22 | type messageTracker struct { 23 | svc *Service 24 | logger *zap.Logger 25 | cache *ttlcache.Cache 26 | } 27 | 28 | func newMessageTracker(svc *Service) *messageTracker { 29 | defaultExpirationDuration := svc.config.Consumer.RoundtripSla 30 | cache := ttlcache.NewCache() 31 | cache.SetTTL(defaultExpirationDuration) 32 | 33 | t := &messageTracker{ 34 | svc: svc, 35 | logger: svc.logger.Named("message_tracker"), 36 | cache: cache, 37 | } 38 | t.cache.SetExpirationReasonCallback(func(key string, reason ttlcache.EvictionReason, value interface{}) { 39 | t.onMessageExpired(key, reason, value.(*EndToEndMessage)) 40 | }) 41 | 42 | return t 43 | } 44 | 45 | func (t *messageTracker) addToTracker(msg *EndToEndMessage) { 46 | t.cache.Set(msg.MessageID, msg) 47 | } 48 | 49 | // updateItemIfExists only updates a message if it still exists in the cache. The remaining time to live will not 50 | // be refreshed. 51 | // If it doesn't exist an ttlcache.ErrNotFound error will be returned. 52 | func (t *messageTracker) updateItemIfExists(msg *EndToEndMessage) error { 53 | _, ttl, err := t.cache.GetWithTTL(msg.MessageID) 54 | if err != nil { 55 | if err == ttlcache.ErrNotFound { 56 | return err 57 | } 58 | panic(err) 59 | } 60 | 61 | // Because the returned TTL is set to the original TTL duration (and not the remaining TTL) we have to calculate 62 | // the remaining TTL now as we want to updat the existing cache item without changing the remaining time to live. 63 | expiryTimestamp := msg.creationTime().Add(ttl) 64 | remainingTTL := expiryTimestamp.Sub(time.Now()) 65 | if remainingTTL < 0 { 66 | // This entry should have been deleted already. Race condition. 67 | return ttlcache.ErrNotFound 68 | } 69 | 70 | err = t.cache.SetWithTTL(msg.MessageID, msg, remainingTTL) 71 | if err != nil { 72 | panic(err) 73 | } 74 | 75 | return nil 76 | } 77 | 78 | // removeFromTracker removes an entry from the cache. If the key does not exist it will return an ttlcache.ErrNotFound error. 79 | func (t *messageTracker) removeFromTracker(messageID string) error { 80 | return t.cache.Remove(messageID) 81 | } 82 | 83 | func (t *messageTracker) onMessageArrived(arrivedMessage *EndToEndMessage) { 84 | cm, err := t.cache.Get(arrivedMessage.MessageID) 85 | if err != nil { 86 | if err == ttlcache.ErrNotFound { 87 | // message expired and was removed from the cache 88 | // it arrived too late, nothing to do here... 89 | return 90 | } else { 91 | panic(fmt.Errorf("failed to get message from cache: %w", err)) 92 | } 93 | } 94 | 95 | msg := cm.(*EndToEndMessage) 96 | 97 | expireTime := msg.creationTime().Add(t.svc.config.Consumer.RoundtripSla) 98 | isExpired := time.Now().Before(expireTime) 99 | latency := time.Now().Sub(msg.creationTime()) 100 | 101 | if !isExpired { 102 | // Message arrived late, but was still in cache. We don't increment the lost counter here because eventually 103 | // it will be evicted from the cache. This case should only pop up if the sla time is exceeded, but if the 104 | // item has not been evicted from the cache yet. 
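// (For clarity: isExpired above is computed as time.Now().Before(expireTime), i.e. it is true while the message is
// still inside its SLA window. This branch therefore handles messages that arrived after the SLA had passed but
// before the cache entry was evicted; the lost counter itself is incremented by onMessageExpired once the entry
// actually expires.)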
105 | t.logger.Info("message arrived late, will be marked as a lost message", 106 | zap.Int64("delay_ms", latency.Milliseconds()), 107 | zap.String("id", msg.MessageID)) 108 | return 109 | } 110 | 111 | // message arrived early enough 112 | pID := strconv.Itoa(msg.partition) 113 | t.svc.messagesReceived.WithLabelValues(pID).Inc() 114 | t.svc.roundtripLatency.WithLabelValues(pID).Observe(latency.Seconds()) 115 | 116 | // Remove message from cache, so that we don't track it any longer and won't mark it as lost when the entry expires. 117 | t.cache.Remove(msg.MessageID) 118 | } 119 | 120 | func (t *messageTracker) onMessageExpired(_ string, reason ttlcache.EvictionReason, value interface{}) { 121 | if reason == ttlcache.Removed { 122 | // We are not interested in messages that have been removed by us! 123 | return 124 | } 125 | 126 | msg := value.(*EndToEndMessage) 127 | 128 | created := msg.creationTime() 129 | age := time.Since(created) 130 | t.svc.lostMessages.WithLabelValues(strconv.Itoa(msg.partition)).Inc() 131 | 132 | t.logger.Debug("message expired/lost", 133 | zap.Int64("age_ms", age.Milliseconds()), 134 | zap.Int("partition", msg.partition), 135 | zap.String("message_id", msg.MessageID), 136 | zap.Bool("successfully_produced", msg.state == EndToEndMessageStateProducedSuccessfully), 137 | zap.Float64("produce_latency_seconds", msg.produceLatency), 138 | ) 139 | } 140 | -------------------------------------------------------------------------------- /e2e/producer.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/google/uuid" 10 | "github.com/twmb/franz-go/pkg/kgo" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | // produceMessagesToAllPartitions sends an EndToEndMessage to every partition on the given topic 15 | func (s *Service) produceMessagesToAllPartitions(ctx context.Context) { 16 | for i := 0; i < s.partitionCount; i++ { 17 | s.produceMessage(ctx, i) 18 | } 19 | } 20 | 21 | // produceMessage produces an end to end record to a single given partition. If it succeeds producing the record 22 | // it will add it to the message tracker. If producing fails a message will be logged and the respective metrics 23 | // will be incremented. 24 | func (s *Service) produceMessage(ctx context.Context, partition int) { 25 | topicName := s.config.TopicManagement.Name 26 | record, msg := createEndToEndRecord(s.minionID, topicName, partition) 27 | 28 | startTime := time.Now() 29 | 30 | // This childCtx will ensure that we will abort our efforts to produce (including retries) when we exceed 31 | // the SLA for producers. 32 | childCtx, cancel := context.WithTimeout(ctx, s.config.Producer.AckSla+2*time.Second) 33 | 34 | pID := strconv.Itoa(partition) 35 | s.messagesProducedInFlight.WithLabelValues(pID).Inc() 36 | s.messageTracker.addToTracker(msg) 37 | s.client.TryProduce(childCtx, record, func(r *kgo.Record, err error) { 38 | defer cancel() 39 | ackDuration := time.Since(startTime) 40 | s.messagesProducedInFlight.WithLabelValues(pID).Dec() 41 | s.messagesProducedTotal.WithLabelValues(pID).Inc() 42 | // We add 0 in order to ensure that the "failed" metric series for that partition id are initialized as well. 
43 | s.messagesProducedFailed.WithLabelValues(pID).Add(0) 44 | s.lostMessages.WithLabelValues(pID).Add(0) 45 | 46 | if err != nil { 47 | s.messagesProducedFailed.WithLabelValues(pID).Inc() 48 | _ = s.messageTracker.removeFromTracker(msg.MessageID) 49 | 50 | s.logger.Info("failed to produce message to end-to-end topic", 51 | zap.String("topic_name", r.Topic), 52 | zap.Int32("partition", r.Partition), 53 | zap.Error(err)) 54 | return 55 | } else { 56 | // Update the message's state. If this message expires and is marked as successfully produced we will 57 | // report this as a lost message, which would indicate that the producer was told that the message got 58 | // produced successfully, but it got lost somewhere. 59 | // We need to use updateItemIfExists() because it's possible that the message has already been consumed 60 | // before we have received the message here (because we were awaiting the produce ack). 61 | msg.state = EndToEndMessageStateProducedSuccessfully 62 | msg.produceLatency = ackDuration.Seconds() 63 | 64 | // TODO: Enable again as soon as https://github.com/ReneKroon/ttlcache/issues/60 is fixed 65 | // Because we cannot update cache items in an atomic fashion we currently can't use this method 66 | // as this would cause a race condition which ends up in records being reported as lost/expired. 67 | // s.messageTracker.updateItemIfExists(msg) 68 | } 69 | 70 | s.produceLatency.WithLabelValues(pID).Observe(ackDuration.Seconds()) 71 | }) 72 | } 73 | 74 | func createEndToEndRecord(minionID string, topicName string, partition int) (*kgo.Record, *EndToEndMessage) { 75 | message := &EndToEndMessage{ 76 | MinionID: minionID, 77 | MessageID: uuid.NewString(), 78 | Timestamp: time.Now().UnixNano(), 79 | 80 | partition: partition, 81 | state: EndToEndMessageStateCreated, 82 | } 83 | 84 | mjson, err := json.Marshal(message) 85 | if err != nil { 86 | // Should never happen since the struct is so simple, 87 | // but if it does, something is completely broken anyway 88 | panic("cannot serialize EndToEndMessage") 89 | } 90 | 91 | record := &kgo.Record{ 92 | Topic: topicName, 93 | Value: mjson, 94 | Partition: int32(partition), // we set partition for producing so our customPartitioner can make use of it 95 | } 96 | 97 | return record, message 98 | } 99 | -------------------------------------------------------------------------------- /e2e/topic_test.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "github.com/stretchr/testify/assert" 5 | "github.com/twmb/franz-go/pkg/kmsg" 6 | "sort" 7 | "testing" 8 | ) 9 | 10 | func TestCalculateAppropriateReplicas(t *testing.T) { 11 | tt := []struct { 12 | TestName string 13 | Brokers []kmsg.MetadataResponseBroker 14 | ReplicationFactor int 15 | LeaderBroker kmsg.MetadataResponseBroker 16 | 17 | // Some cases may have more than one possible solution, each entry in the outer array covers one allowed 18 | // solution. The compared int32 array order does not matter, except for the very first item as this indicates 19 | // the preferred leader. For example if you use {2, 0, 1} as expected result this would also be valid for 20 | // the actual result {2, 1, 0} but not for {1, 2, 0} - because '2' must be the first int32. 
21 | ExpectedResults [][]int32 22 | }{ 23 | { 24 | TestName: "3 Brokers, no rack, RF = 3", 25 | Brokers: []kmsg.MetadataResponseBroker{ 26 | {NodeID: 0, Rack: nil}, 27 | {NodeID: 1, Rack: nil}, 28 | {NodeID: 2, Rack: nil}, 29 | }, 30 | ReplicationFactor: 3, 31 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 2, Rack: nil}, 32 | ExpectedResults: [][]int32{{2, 0, 1}}, 33 | }, 34 | 35 | { 36 | TestName: "3 Brokers, 3 racks, RF = 3", 37 | Brokers: []kmsg.MetadataResponseBroker{ 38 | {NodeID: 0, Rack: kmsg.StringPtr("a")}, 39 | {NodeID: 1, Rack: kmsg.StringPtr("b")}, 40 | {NodeID: 2, Rack: kmsg.StringPtr("c")}, 41 | }, 42 | ReplicationFactor: 3, 43 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 2, Rack: kmsg.StringPtr("c")}, 44 | ExpectedResults: [][]int32{{2, 0, 1}}, 45 | }, 46 | 47 | { 48 | TestName: "3 Brokers, 3 racks, RF = 1", 49 | Brokers: []kmsg.MetadataResponseBroker{ 50 | {NodeID: 0, Rack: kmsg.StringPtr("a")}, 51 | {NodeID: 1, Rack: kmsg.StringPtr("b")}, 52 | {NodeID: 2, Rack: kmsg.StringPtr("c")}, 53 | }, 54 | ReplicationFactor: 1, 55 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 1, Rack: kmsg.StringPtr("b")}, 56 | ExpectedResults: [][]int32{{1}}, 57 | }, 58 | 59 | { 60 | TestName: "3 Brokers, 3 racks, RF = 2", 61 | Brokers: []kmsg.MetadataResponseBroker{ 62 | {NodeID: 0, Rack: kmsg.StringPtr("a")}, 63 | {NodeID: 1, Rack: kmsg.StringPtr("b")}, 64 | {NodeID: 2, Rack: kmsg.StringPtr("c")}, 65 | }, 66 | ReplicationFactor: 2, 67 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 1, Rack: kmsg.StringPtr("b")}, 68 | ExpectedResults: [][]int32{{1, 0}, {1, 2}}, 69 | }, 70 | 71 | { 72 | TestName: "6 Brokers, 3 racks, RF = 3", 73 | Brokers: []kmsg.MetadataResponseBroker{ 74 | {NodeID: 0, Rack: kmsg.StringPtr("a")}, 75 | {NodeID: 1, Rack: kmsg.StringPtr("b")}, 76 | {NodeID: 2, Rack: kmsg.StringPtr("c")}, 77 | {NodeID: 3, Rack: kmsg.StringPtr("a")}, 78 | {NodeID: 4, Rack: kmsg.StringPtr("b")}, 79 | {NodeID: 5, Rack: kmsg.StringPtr("c")}, 80 | }, 81 | ReplicationFactor: 3, 82 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 4, Rack: kmsg.StringPtr("b")}, 83 | ExpectedResults: [][]int32{{4, 0, 2}, {4, 0, 5}, {4, 3, 2}, {4, 3, 5}}, 84 | }, 85 | 86 | { 87 | TestName: "4 Brokers, 2 racks, RF = 3", 88 | Brokers: []kmsg.MetadataResponseBroker{ 89 | {NodeID: 0, Rack: kmsg.StringPtr("a")}, 90 | {NodeID: 1, Rack: kmsg.StringPtr("b")}, 91 | {NodeID: 2, Rack: kmsg.StringPtr("a")}, 92 | {NodeID: 3, Rack: kmsg.StringPtr("b")}, 93 | }, 94 | ReplicationFactor: 3, 95 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 0, Rack: kmsg.StringPtr("a")}, 96 | ExpectedResults: [][]int32{{0, 1, 2}, {0, 1, 3}, {0, 2, 3}}, 97 | }, 98 | 99 | { 100 | TestName: "6 Brokers, 3 racks, RF = 3, lowest node id != 0", 101 | Brokers: []kmsg.MetadataResponseBroker{ 102 | {NodeID: 10, Rack: kmsg.StringPtr("a")}, 103 | {NodeID: 11, Rack: kmsg.StringPtr("b")}, 104 | {NodeID: 12, Rack: kmsg.StringPtr("c")}, 105 | {NodeID: 13, Rack: kmsg.StringPtr("a")}, 106 | {NodeID: 14, Rack: kmsg.StringPtr("b")}, 107 | {NodeID: 15, Rack: kmsg.StringPtr("c")}, 108 | }, 109 | ReplicationFactor: 3, 110 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 11, Rack: kmsg.StringPtr("b")}, 111 | ExpectedResults: [][]int32{{11, 10, 12}, {11, 12, 13}, {11, 13, 15}}, 112 | }, 113 | 114 | { 115 | TestName: "6 Brokers, 3 racks, RF = 5, lowest node id != 0", 116 | Brokers: []kmsg.MetadataResponseBroker{ 117 | {NodeID: 10, Rack: kmsg.StringPtr("a")}, 118 | {NodeID: 11, Rack: kmsg.StringPtr("b")}, 119 | {NodeID: 12, Rack: kmsg.StringPtr("c")}, 120 | 
{NodeID: 13, Rack: kmsg.StringPtr("a")},
121 | {NodeID: 14, Rack: kmsg.StringPtr("b")},
122 | {NodeID: 15, Rack: kmsg.StringPtr("c")},
123 | },
124 | ReplicationFactor: 5,
125 | LeaderBroker: kmsg.MetadataResponseBroker{NodeID: 11, Rack: kmsg.StringPtr("b")},
126 | ExpectedResults: [][]int32{{11, 10, 12, 13, 14}, {11, 10, 13, 14, 15}, {11, 12, 13, 14, 15}, {11, 10, 12, 13, 15}, {11, 10, 12, 14, 15}},
127 | },
128 | }
129 |
130 | svc := Service{}
131 | for _, test := range tt {
132 | meta := kmsg.NewMetadataResponse()
133 | meta.Brokers = test.Brokers
134 | replicaIDs := svc.calculateAppropriateReplicas(&meta, test.ReplicationFactor, test.LeaderBroker)
135 |
136 | matchesAtLeastOneExpectedResult := false
137 | for _, possibleResult := range test.ExpectedResults {
138 | isValidResult := possibleResult[0] == replicaIDs[0] && doElementsMatch(possibleResult, replicaIDs)
139 | if isValidResult {
140 | matchesAtLeastOneExpectedResult = true
141 | break
142 | }
143 | }
144 | if !matchesAtLeastOneExpectedResult {
145 | // Use the first expected result so that assert.ElementsMatch prints a valid result alongside the actual one.
146 | assert.ElementsMatch(t, test.ExpectedResults[0], replicaIDs, test.TestName)
147 | }
148 | }
149 | }
150 |
151 | func doElementsMatch(a, b []int32) bool {
152 | if len(a) != len(b) {
153 | return false
154 | }
155 |
156 | sort.Slice(a, func(i, j int) bool { return a[i] < a[j] })
157 | sort.Slice(b, func(i, j int) bool { return b[i] < b[j] })
158 | for i, num := range a {
159 | if num != b[i] {
160 | return false
161 | }
162 | }
163 |
164 | return true
165 | }
166 | -------------------------------------------------------------------------------- /e2e/utils.go: --------------------------------------------------------------------------------
1 | package e2e
2 |
3 | import (
4 | "context"
5 | "math"
6 | "time"
7 |
8 | "github.com/prometheus/client_golang/prometheus"
9 | "github.com/twmb/franz-go/pkg/kerr"
10 | "github.com/twmb/franz-go/pkg/kmsg"
11 | "go.uber.org/zap"
12 | )
13 |
14 | // createHistogramBuckets creates the buckets for the histogram based on the number of desired buckets (10) and the
15 | // upper bucket size.
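// Worked example (a sketch, not part of utils.go): for a 2s end-to-end SLA,
// maxLatency.Milliseconds()/10 = 200 and math.Logb(200) returns 7 (the binary
// exponent of 200), so count = 7+3 = 10 and the resulting buckets are
// 0.005, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28 and 2.56 seconds.
func exampleBucketsForTwoSecondSla() []float64 {
	maxLatency := 2 * time.Second
	count := int(math.Logb(float64(maxLatency.Milliseconds()/10))) + 3
	return prometheus.ExponentialBuckets(0.005, 2, count) // 10 buckets from 0.005s up to 2.56s
}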
16 | func createHistogramBuckets(maxLatency time.Duration) []float64 { 17 | // Since this is an exponential bucket we need to take Log base2 or binary as the upper bound 18 | // Divide by 10 for the argument because the base is counted as 20ms and we want to normalize it as base 2 instead of 20 19 | // +2 because it starts at 5ms or 0.005 sec, to account 5ms and 10ms before it goes to the base which in this case is 0.02 sec or 20ms 20 | // and another +1 to account for decimal points on int parsing 21 | latencyCount := math.Logb(float64(maxLatency.Milliseconds() / 10)) 22 | count := int(latencyCount) + 3 23 | bucket := prometheus.ExponentialBuckets(0.005, 2, count) 24 | 25 | return bucket 26 | } 27 | 28 | func containsStr(ar []string, x string) (bool, int) { 29 | for i, item := range ar { 30 | if item == x { 31 | return true, i 32 | } 33 | } 34 | return false, -1 35 | } 36 | 37 | // logCommitErrors logs all errors in commit response and returns a well formatted error code if there was one 38 | func (s *Service) logCommitErrors(r *kmsg.OffsetCommitResponse, err error) string { 39 | if err != nil { 40 | if err == context.DeadlineExceeded { 41 | s.logger.Warn("offset commit failed because SLA has been exceeded") 42 | return "OFFSET_COMMIT_SLA_EXCEEDED" 43 | } 44 | 45 | s.logger.Warn("offset commit failed", zap.Error(err)) 46 | return "RESPONSE_ERROR" 47 | } 48 | 49 | lastErrCode := "" 50 | for _, t := range r.Topics { 51 | for _, p := range t.Partitions { 52 | typedErr := kerr.TypedErrorForCode(p.ErrorCode) 53 | if typedErr == nil { 54 | continue 55 | } 56 | 57 | s.logger.Warn("error committing partition offset", 58 | zap.String("topic", t.Topic), 59 | zap.Int32("partition_id", p.Partition), 60 | zap.Error(typedErr), 61 | ) 62 | lastErrCode = typedErr.Message 63 | } 64 | } 65 | 66 | return lastErrCode 67 | } 68 | 69 | // brokerMetadataByBrokerID returns a map of all broker metadata keyed by their BrokerID 70 | func brokerMetadataByBrokerID(meta []kmsg.MetadataResponseBroker) map[int32]kmsg.MetadataResponseBroker { 71 | res := make(map[int32]kmsg.MetadataResponseBroker) 72 | for _, broker := range meta { 73 | res[broker.NodeID] = broker 74 | } 75 | return res 76 | } 77 | 78 | // brokerMetadataByRackID returns a map of all broker metadata keyed by their Rack identifier 79 | func brokerMetadataByRackID(meta []kmsg.MetadataResponseBroker) map[string][]kmsg.MetadataResponseBroker { 80 | res := make(map[string][]kmsg.MetadataResponseBroker) 81 | for _, broker := range meta { 82 | rackID := "" 83 | if broker.Rack != nil { 84 | rackID = *broker.Rack 85 | } 86 | res[rackID] = append(res[rackID], broker) 87 | } 88 | return res 89 | } 90 | 91 | func pointerStrToStr(str *string) string { 92 | if str == nil { 93 | return "" 94 | } 95 | return *str 96 | } 97 | 98 | func safeUnwrap(err error) string { 99 | if err == nil { 100 | return "" 101 | } 102 | return err.Error() 103 | } 104 | 105 | func isInArray(num int16, arr []int16) bool { 106 | for _, n := range arr { 107 | if num == n { 108 | return true 109 | } 110 | } 111 | return false 112 | } 113 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cloudhut/kminion/v2 2 | 3 | go 1.24 4 | 5 | require ( 6 | github.com/google/uuid v1.6.0 7 | github.com/jcmturner/gokrb5/v8 v8.4.4 8 | github.com/jellydator/ttlcache/v2 v2.11.1 9 | github.com/knadh/koanf v1.5.0 10 | github.com/mitchellh/mapstructure v1.5.0 11 | 
github.com/orcaman/concurrent-map v1.0.0 12 | github.com/pkg/errors v0.9.1 13 | github.com/prometheus/client_golang v1.20.5 14 | github.com/stretchr/testify v1.9.0 15 | github.com/twmb/franz-go v1.18.0 16 | github.com/twmb/franz-go/pkg/kadm v1.14.0 17 | github.com/twmb/franz-go/pkg/kmsg v1.9.0 18 | github.com/twmb/franz-go/pkg/sasl/kerberos v1.1.0 19 | go.uber.org/atomic v1.11.0 20 | go.uber.org/zap v1.27.0 21 | golang.org/x/sync v0.8.0 22 | ) 23 | 24 | require ( 25 | github.com/beorn7/perks v1.0.1 // indirect 26 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 27 | github.com/davecgh/go-spew v1.1.1 // indirect 28 | github.com/fsnotify/fsnotify v1.8.0 // indirect 29 | github.com/hashicorp/go-uuid v1.0.3 // indirect 30 | github.com/jcmturner/aescts/v2 v2.0.0 // indirect 31 | github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect 32 | github.com/jcmturner/gofork v1.7.6 // indirect 33 | github.com/jcmturner/rpc/v2 v2.0.3 // indirect 34 | github.com/klauspost/compress v1.17.11 // indirect 35 | github.com/mitchellh/copystructure v1.2.0 // indirect 36 | github.com/mitchellh/reflectwalk v1.0.2 // indirect 37 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 38 | github.com/pelletier/go-toml v1.9.1 // indirect 39 | github.com/pierrec/lz4/v4 v4.1.21 // indirect 40 | github.com/pmezard/go-difflib v1.0.0 // indirect 41 | github.com/prometheus/client_model v0.6.1 // indirect 42 | github.com/prometheus/common v0.60.1 // indirect 43 | github.com/prometheus/procfs v0.15.1 // indirect 44 | go.uber.org/multierr v1.11.0 // indirect 45 | golang.org/x/crypto v0.36.0 // indirect 46 | golang.org/x/net v0.37.0 // indirect 47 | golang.org/x/sys v0.31.0 // indirect 48 | google.golang.org/protobuf v1.35.1 // indirect 49 | gopkg.in/yaml.v3 v3.0.1 // indirect 50 | ) 51 | -------------------------------------------------------------------------------- /kafka/client_config_helper.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "crypto/x509" 7 | "encoding/pem" 8 | "fmt" 9 | "io/ioutil" 10 | "net" 11 | "time" 12 | 13 | "github.com/jcmturner/gokrb5/v8/client" 14 | "github.com/jcmturner/gokrb5/v8/keytab" 15 | "github.com/twmb/franz-go/pkg/kgo" 16 | "github.com/twmb/franz-go/pkg/kversion" 17 | "github.com/twmb/franz-go/pkg/sasl" 18 | "github.com/twmb/franz-go/pkg/sasl/kerberos" 19 | "github.com/twmb/franz-go/pkg/sasl/oauth" 20 | "github.com/twmb/franz-go/pkg/sasl/plain" 21 | "github.com/twmb/franz-go/pkg/sasl/scram" 22 | "go.uber.org/zap" 23 | 24 | krbconfig "github.com/jcmturner/gokrb5/v8/config" 25 | ) 26 | 27 | // NewKgoConfig creates a new Config for the Kafka Client as exposed by the franz-go library. 28 | // If TLS certificates can't be read an error will be returned. 29 | // logger is only used to print warnings about TLS. 30 | func NewKgoConfig(cfg Config, logger *zap.Logger) ([]kgo.Opt, error) { 31 | opts := []kgo.Opt{ 32 | kgo.SeedBrokers(cfg.Brokers...), 33 | kgo.MaxVersions(kversion.V2_7_0()), 34 | kgo.ClientID(cfg.ClientID), 35 | kgo.FetchMaxBytes(5 * 1000 * 1000), // 5MB 36 | kgo.MaxConcurrentFetches(10), 37 | // Allow metadata to be refreshed more often than 5s (default) if needed. 38 | // That will mitigate issues with unknown partitions shortly after creating 39 | // them. 
40 | kgo.MetadataMinAge(time.Second), 41 | } 42 | 43 | // Create Logger 44 | kgoLogger := KgoZapLogger{ 45 | logger: logger.Sugar(), 46 | } 47 | opts = append(opts, kgo.WithLogger(kgoLogger)) 48 | 49 | // Add Rack Awareness if configured 50 | if cfg.RackID != "" { 51 | opts = append(opts, kgo.Rack(cfg.RackID)) 52 | } 53 | 54 | // Configure SASL 55 | if cfg.SASL.Enabled { 56 | // SASL Plain 57 | if cfg.SASL.Mechanism == "PLAIN" { 58 | mechanism := plain.Auth{ 59 | User: cfg.SASL.Username, 60 | Pass: cfg.SASL.Password, 61 | }.AsMechanism() 62 | opts = append(opts, kgo.SASL(mechanism)) 63 | } 64 | 65 | // SASL SCRAM 66 | if cfg.SASL.Mechanism == "SCRAM-SHA-256" || cfg.SASL.Mechanism == "SCRAM-SHA-512" { 67 | var mechanism sasl.Mechanism 68 | scramAuth := scram.Auth{ 69 | User: cfg.SASL.Username, 70 | Pass: cfg.SASL.Password, 71 | } 72 | if cfg.SASL.Mechanism == "SCRAM-SHA-256" { 73 | mechanism = scramAuth.AsSha256Mechanism() 74 | } 75 | if cfg.SASL.Mechanism == "SCRAM-SHA-512" { 76 | mechanism = scramAuth.AsSha512Mechanism() 77 | } 78 | opts = append(opts, kgo.SASL(mechanism)) 79 | } 80 | 81 | // Kerberos 82 | if cfg.SASL.Mechanism == "GSSAPI" { 83 | var krbClient *client.Client 84 | 85 | kerbCfg, err := krbconfig.Load(cfg.SASL.GSSAPI.KerberosConfigPath) 86 | if err != nil { 87 | return nil, fmt.Errorf("failed to create kerberos config from specified config filepath: %w", err) 88 | } 89 | 90 | switch cfg.SASL.GSSAPI.AuthType { 91 | case "USER_AUTH:": 92 | krbClient = client.NewWithPassword( 93 | cfg.SASL.GSSAPI.Username, 94 | cfg.SASL.GSSAPI.Realm, 95 | cfg.SASL.GSSAPI.Password, 96 | kerbCfg, 97 | client.DisablePAFXFAST(!cfg.SASL.GSSAPI.EnableFast)) 98 | case "KEYTAB_AUTH": 99 | ktb, err := keytab.Load(cfg.SASL.GSSAPI.KeyTabPath) 100 | if err != nil { 101 | return nil, fmt.Errorf("failed to load keytab: %w", err) 102 | } 103 | krbClient = client.NewWithKeytab( 104 | cfg.SASL.GSSAPI.Username, 105 | cfg.SASL.GSSAPI.Realm, 106 | ktb, 107 | kerbCfg, 108 | client.DisablePAFXFAST(!cfg.SASL.GSSAPI.EnableFast)) 109 | } 110 | kerberosMechanism := kerberos.Auth{ 111 | Client: krbClient, 112 | Service: cfg.SASL.GSSAPI.ServiceName, 113 | PersistAfterAuth: true, 114 | }.AsMechanism() 115 | opts = append(opts, kgo.SASL(kerberosMechanism)) 116 | } 117 | 118 | // OAuthBearer 119 | if cfg.SASL.Mechanism == "OAUTHBEARER" { 120 | mechanism := oauth.Oauth(func(ctx context.Context) (oauth.Auth, error) { 121 | token, err := cfg.SASL.OAuthBearer.getToken(ctx) 122 | return oauth.Auth{ 123 | Zid: cfg.SASL.OAuthBearer.ClientID, 124 | Token: token, 125 | }, err 126 | }) 127 | opts = append(opts, kgo.SASL(mechanism)) 128 | } 129 | } 130 | 131 | // Configure TLS 132 | var caCertPool *x509.CertPool 133 | if cfg.TLS.Enabled { 134 | // Root CA 135 | if cfg.TLS.CaFilepath != "" || len(cfg.TLS.Ca) > 0 { 136 | ca := []byte(cfg.TLS.Ca) 137 | if cfg.TLS.CaFilepath != "" { 138 | caBytes, err := ioutil.ReadFile(cfg.TLS.CaFilepath) 139 | if err != nil { 140 | return nil, fmt.Errorf("failed to load ca cert: %w", err) 141 | } 142 | ca = caBytes 143 | } 144 | caCertPool = x509.NewCertPool() 145 | isSuccessful := caCertPool.AppendCertsFromPEM(ca) 146 | if !isSuccessful { 147 | logger.Warn("failed to append ca file to cert pool, is this a valid PEM format?") 148 | } 149 | } 150 | 151 | // If configured load TLS cert & key - Mutual TLS 152 | var certificates []tls.Certificate 153 | hasCertFile := cfg.TLS.CertFilepath != "" || len(cfg.TLS.Cert) > 0 154 | hasKeyFile := cfg.TLS.KeyFilepath != "" || len(cfg.TLS.Key) > 0 155 | if hasCertFile 
|| hasKeyFile { 156 | cert := []byte(cfg.TLS.Cert) 157 | privateKey := []byte(cfg.TLS.Key) 158 | // 1. Read certificates 159 | if cfg.TLS.CertFilepath != "" { 160 | certBytes, err := ioutil.ReadFile(cfg.TLS.CertFilepath) 161 | if err != nil { 162 | return nil, fmt.Errorf("failed to TLS certificate: %w", err) 163 | } 164 | cert = certBytes 165 | } 166 | 167 | if cfg.TLS.KeyFilepath != "" { 168 | keyBytes, err := ioutil.ReadFile(cfg.TLS.KeyFilepath) 169 | if err != nil { 170 | return nil, fmt.Errorf("failed to read TLS key: %w", err) 171 | } 172 | privateKey = keyBytes 173 | } 174 | 175 | // 2. Check if private key needs to be decrypted. Decrypt it if passphrase is given, otherwise return error 176 | pemBlock, _ := pem.Decode(privateKey) 177 | if pemBlock == nil { 178 | return nil, fmt.Errorf("no valid private key found") 179 | } 180 | 181 | if x509.IsEncryptedPEMBlock(pemBlock) { 182 | decryptedKey, err := x509.DecryptPEMBlock(pemBlock, []byte(cfg.TLS.Passphrase)) 183 | if err != nil { 184 | return nil, fmt.Errorf("private key is encrypted, but could not decrypt it: %s", err) 185 | } 186 | // If private key was encrypted we can overwrite the original contents now with the decrypted version 187 | privateKey = pem.EncodeToMemory(&pem.Block{Type: pemBlock.Type, Bytes: decryptedKey}) 188 | } 189 | tlsCert, err := tls.X509KeyPair(cert, privateKey) 190 | if err != nil { 191 | return nil, fmt.Errorf("cannot parse pem: %s", err) 192 | } 193 | certificates = []tls.Certificate{tlsCert} 194 | } 195 | 196 | tlsDialer := &tls.Dialer{ 197 | NetDialer: &net.Dialer{Timeout: 10 * time.Second}, 198 | Config: &tls.Config{ 199 | InsecureSkipVerify: cfg.TLS.InsecureSkipTLSVerify, 200 | Certificates: certificates, 201 | RootCAs: caCertPool, 202 | }, 203 | } 204 | opts = append(opts, kgo.Dialer(tlsDialer.DialContext)) 205 | } 206 | 207 | return opts, nil 208 | } 209 | -------------------------------------------------------------------------------- /kafka/client_logger.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import ( 4 | "github.com/twmb/franz-go/pkg/kgo" 5 | "go.uber.org/zap" 6 | ) 7 | 8 | type KgoZapLogger struct { 9 | logger *zap.SugaredLogger 10 | } 11 | 12 | // Level Implements kgo.Logger interface. It returns the log level to log at. 13 | // We pin this to debug as the zap logger decides what to actually send to the output stream. 14 | func (k KgoZapLogger) Level() kgo.LogLevel { 15 | return kgo.LogLevelDebug 16 | } 17 | 18 | // Log implements kgo.Logger interface 19 | func (k KgoZapLogger) Log(level kgo.LogLevel, msg string, keyvals ...interface{}) { 20 | switch level { 21 | case kgo.LogLevelDebug: 22 | k.logger.Debugw(msg, keyvals...) 23 | case kgo.LogLevelInfo: 24 | k.logger.Infow(msg, keyvals...) 25 | case kgo.LogLevelWarn: 26 | k.logger.Warnw(msg, keyvals...) 27 | case kgo.LogLevelError: 28 | k.logger.Errorw(msg, keyvals...) 
29 | } 30 | } 31 | -------------------------------------------------------------------------------- /kafka/config.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import "fmt" 4 | 5 | type Config struct { 6 | // General 7 | Brokers []string `koanf:"brokers"` 8 | ClientID string `koanf:"clientId"` 9 | RackID string `koanf:"rackId"` 10 | 11 | TLS TLSConfig `koanf:"tls"` 12 | SASL SASLConfig `koanf:"sasl"` 13 | 14 | RetryInitConnection bool `koanf:"retryInitConnection"` 15 | } 16 | 17 | func (c *Config) SetDefaults() { 18 | c.ClientID = "kminion" 19 | 20 | c.TLS.SetDefaults() 21 | c.SASL.SetDefaults() 22 | } 23 | 24 | func (c *Config) Validate() error { 25 | if len(c.Brokers) == 0 { 26 | return fmt.Errorf("no seed brokers specified, at least one must be configured") 27 | } 28 | 29 | err := c.TLS.Validate() 30 | if err != nil { 31 | return fmt.Errorf("failed to validate TLS config: %w", err) 32 | } 33 | 34 | err = c.SASL.Validate() 35 | if err != nil { 36 | return fmt.Errorf("failed to validate SASL config: %w", err) 37 | } 38 | 39 | return nil 40 | } 41 | -------------------------------------------------------------------------------- /kafka/config_sasl.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import "fmt" 4 | 5 | const ( 6 | SASLMechanismPlain = "PLAIN" 7 | SASLMechanismScramSHA256 = "SCRAM-SHA-256" 8 | SASLMechanismScramSHA512 = "SCRAM-SHA-512" 9 | SASLMechanismGSSAPI = "GSSAPI" 10 | SASLMechanismOAuthBearer = "OAUTHBEARER" 11 | ) 12 | 13 | // SASLConfig for Kafka Client 14 | type SASLConfig struct { 15 | Enabled bool `koanf:"enabled"` 16 | Username string `koanf:"username"` 17 | Password string `koanf:"password"` 18 | Mechanism string `koanf:"mechanism"` 19 | 20 | // SASL Mechanisms that require more configuration than username & password 21 | GSSAPI SASLGSSAPIConfig `koanf:"gssapi"` 22 | OAuthBearer OAuthBearerConfig `koanf:"oauth"` 23 | } 24 | 25 | // SetDefaults for SASL Config 26 | func (c *SASLConfig) SetDefaults() { 27 | c.Enabled = false 28 | c.Mechanism = SASLMechanismPlain 29 | c.GSSAPI.SetDefaults() 30 | } 31 | 32 | // Validate SASL config input 33 | func (c *SASLConfig) Validate() error { 34 | if !c.Enabled { 35 | return nil 36 | } 37 | 38 | switch c.Mechanism { 39 | case SASLMechanismPlain, SASLMechanismScramSHA256, SASLMechanismScramSHA512, SASLMechanismGSSAPI: 40 | // Valid and supported 41 | case SASLMechanismOAuthBearer: 42 | return c.OAuthBearer.Validate() 43 | default: 44 | return fmt.Errorf("given sasl mechanism '%v' is invalid", c.Mechanism) 45 | } 46 | 47 | return nil 48 | } 49 | -------------------------------------------------------------------------------- /kafka/config_sasl_gssapi.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | // SASLGSSAPIConfig represents the Kafka Kerberos config 4 | type SASLGSSAPIConfig struct { 5 | AuthType string `koanf:"authType"` 6 | KeyTabPath string `koanf:"keyTabPath"` 7 | KerberosConfigPath string `koanf:"kerberosConfigPath"` 8 | ServiceName string `koanf:"serviceName"` 9 | Username string `koanf:"username"` 10 | Password string `koanf:"password"` 11 | Realm string `koanf:"realm"` 12 | 13 | // EnableFAST enables FAST, which is a pre-authentication framework for Kerberos. 14 | // It includes a mechanism for tunneling pre-authentication exchanges using armoured KDC messages. 
15 | // FAST provides increased resistance to passive password guessing attacks. 16 | EnableFast bool `koanf:"enableFast"` 17 | } 18 | 19 | func (s *SASLGSSAPIConfig) SetDefaults() { 20 | s.EnableFast = true 21 | } 22 | -------------------------------------------------------------------------------- /kafka/config_sasl_oauthbearer.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import ( 4 | "context" 5 | "encoding/base64" 6 | "encoding/json" 7 | "fmt" 8 | "net/http" 9 | "net/url" 10 | "strings" 11 | ) 12 | 13 | type OAuthBearerConfig struct { 14 | TokenEndpoint string `koanf:"tokenEndpoint"` 15 | ClientID string `koanf:"clientId"` 16 | ClientSecret string `koanf:"clientSecret"` 17 | Scope string `koanf:"scope"` 18 | } 19 | 20 | func (c *OAuthBearerConfig) Validate() error { 21 | if c.TokenEndpoint == "" { 22 | return fmt.Errorf("OAuthBearer token endpoint is not specified") 23 | } 24 | if c.ClientID == "" || c.ClientSecret == "" { 25 | return fmt.Errorf("OAuthBearer client credentials are not specified") 26 | } 27 | return nil 28 | } 29 | 30 | // same as AcquireToken in Console https://github.com/redpanda-data/console/blob/master/backend/pkg/config/kafka_sasl_oauth.go#L56 31 | func (c *OAuthBearerConfig) getToken(ctx context.Context) (string, error) { 32 | authHeaderValue := base64.StdEncoding.EncodeToString([]byte(c.ClientID + ":" + c.ClientSecret)) 33 | 34 | queryParams := url.Values{ 35 | "grant_type": []string{"client_credentials"}, 36 | "scope": []string{c.Scope}, 37 | } 38 | 39 | req, err := http.NewRequestWithContext(ctx, "POST", c.TokenEndpoint, strings.NewReader(queryParams.Encode())) 40 | if err != nil { 41 | return "", fmt.Errorf("failed to create HTTP request: %w", err) 42 | } 43 | 44 | req.URL.RawQuery = queryParams.Encode() 45 | 46 | req.Header.Set("Authorization", "Basic "+authHeaderValue) 47 | req.Header.Set("Content-Type", "application/x-www-form-urlencoded") 48 | 49 | client := &http.Client{} 50 | 51 | resp, err := client.Do(req) 52 | if err != nil { 53 | return "", fmt.Errorf("HTTP request failed: %w", err) 54 | } 55 | defer resp.Body.Close() 56 | 57 | if resp.StatusCode != http.StatusOK { 58 | return "", fmt.Errorf("token request failed with status code %d", resp.StatusCode) 59 | } 60 | 61 | var tokenResponse map[string]interface{} 62 | decoder := json.NewDecoder(resp.Body) 63 | if err := decoder.Decode(&tokenResponse); err != nil { 64 | return "", fmt.Errorf("failed to parse token response: %w", err) 65 | } 66 | 67 | accessToken, ok := tokenResponse["access_token"].(string) 68 | if !ok { 69 | return "", fmt.Errorf("access_token not found in token response") 70 | } 71 | 72 | return accessToken, nil 73 | } 74 | -------------------------------------------------------------------------------- /kafka/config_tls.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import "fmt" 4 | 5 | // TLSConfig to connect to Kafka via TLS 6 | type TLSConfig struct { 7 | Enabled bool `koanf:"enabled"` 8 | CaFilepath string `koanf:"caFilepath"` 9 | CertFilepath string `koanf:"certFilepath"` 10 | KeyFilepath string `koanf:"keyFilepath"` 11 | Ca string `koanf:"ca"` 12 | Cert string `koanf:"cert"` 13 | Key string `koanf:"key"` 14 | Passphrase string `koanf:"passphrase"` 15 | InsecureSkipTLSVerify bool `koanf:"insecureSkipTlsVerify"` 16 | } 17 | 18 | func (c *TLSConfig) SetDefaults() { 19 | c.Enabled = false 20 | } 21 | 22 | func (c *TLSConfig) Validate() error { 23 | if 
len(c.CaFilepath) > 0 && len(c.Ca) > 0 { 24 | return fmt.Errorf("config keys 'caFilepath' and 'ca' are both set. only one can be used at the same time") 25 | } 26 | if len(c.CertFilepath) > 0 && len(c.Cert) > 0 { 27 | return fmt.Errorf("config keys 'certFilepath' and 'cert' are both set. only one can be used at the same time") 28 | } 29 | 30 | if len(c.KeyFilepath) > 0 && len(c.Key) > 0 { 31 | return fmt.Errorf("config keys 'keyFilepath' and 'key' are both set. only one can be used at the same time") 32 | } 33 | return nil 34 | } 35 | -------------------------------------------------------------------------------- /kafka/service.go: -------------------------------------------------------------------------------- 1 | package kafka 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/twmb/franz-go/pkg/kerr" 9 | "github.com/twmb/franz-go/pkg/kgo" 10 | "github.com/twmb/franz-go/pkg/kmsg" 11 | "github.com/twmb/franz-go/pkg/kversion" 12 | "go.uber.org/zap" 13 | ) 14 | 15 | type Service struct { 16 | cfg Config 17 | logger *zap.Logger 18 | } 19 | 20 | func NewService(cfg Config, logger *zap.Logger) *Service { 21 | return &Service{ 22 | cfg: cfg, 23 | logger: logger.Named("kafka_service"), 24 | } 25 | } 26 | 27 | // CreateAndTestClient creates a client with the services default settings 28 | // logger: will be used to log connections, errors, warnings about tls config, ... 29 | func (s *Service) CreateAndTestClient(ctx context.Context, l *zap.Logger, opts []kgo.Opt) (*kgo.Client, error) { 30 | logger := l.Named("kgo_client") 31 | // Config with default options 32 | kgoOpts, err := NewKgoConfig(s.cfg, logger) 33 | if err != nil { 34 | return nil, fmt.Errorf("failed to create a valid kafka Client config: %w", err) 35 | } 36 | // Append user (the service calling this method) provided options 37 | kgoOpts = append(kgoOpts, opts...) 38 | 39 | // Create kafka client 40 | client, err := kgo.NewClient(kgoOpts...) 41 | if err != nil { 42 | return nil, fmt.Errorf("failed to create kafka Client: %w", err) 43 | } 44 | 45 | // Test connection 46 | for { 47 | err = s.testConnection(client, ctx) 48 | if err == nil { 49 | break 50 | } 51 | 52 | if !s.cfg.RetryInitConnection { 53 | return nil, fmt.Errorf("failed to test connectivity to Kafka cluster %w", err) 54 | } 55 | 56 | logger.Warn("failed to test connectivity to Kafka cluster, retrying in 5 seconds", zap.Error(err)) 57 | time.Sleep(time.Second * 5) 58 | } 59 | 60 | return client, nil 61 | } 62 | 63 | // Brokers returns list of brokers this service is connecting to 64 | func (s *Service) Brokers() []string { 65 | return s.cfg.Brokers 66 | } 67 | 68 | // testConnection tries to fetch Broker metadata and prints some information if connection succeeds. An error will be 69 | // returned if connecting fails. 
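// Usage sketch (an assumption, not part of service.go): callers such as the minion
// and end-to-end services obtain their kgo.Client through CreateAndTestClient so that
// connectivity is verified (and optionally retried via retryInitConnection) before any
// metrics are scraped. The extra option shown here is a hypothetical caller-specific setting.
func exampleCreateClient(ctx context.Context, svc *Service, logger *zap.Logger) (*kgo.Client, error) {
	return svc.CreateAndTestClient(ctx, logger, []kgo.Opt{
		kgo.ConsumeTopics("__consumer_offsets"), // e.g. for an offsets consumer
	})
}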
70 | func (s *Service) testConnection(client *kgo.Client, ctx context.Context) error { 71 | connectCtx, cancel := context.WithTimeout(ctx, 15*time.Second) 72 | defer cancel() 73 | 74 | req := kmsg.MetadataRequest{ 75 | Topics: nil, 76 | } 77 | res, err := req.RequestWith(connectCtx, client) 78 | if err != nil { 79 | return fmt.Errorf("failed to request metadata: %w", err) 80 | } 81 | 82 | // Request versions in order to guess Kafka Cluster version 83 | versionsReq := kmsg.NewApiVersionsRequest() 84 | versionsRes, err := versionsReq.RequestWith(connectCtx, client) 85 | if err != nil { 86 | return fmt.Errorf("failed to request api versions: %w", err) 87 | } 88 | err = kerr.ErrorForCode(versionsRes.ErrorCode) 89 | if err != nil { 90 | return fmt.Errorf("failed to request api versions. Inner kafka error: %w", err) 91 | } 92 | versions := kversion.FromApiVersionsResponse(versionsRes) 93 | 94 | s.logger.Debug("successfully connected to kafka cluster", 95 | zap.Int("advertised_broker_count", len(res.Brokers)), 96 | zap.Int("topic_count", len(res.Topics)), 97 | zap.Int32("controller_id", res.ControllerID), 98 | zap.String("kafka_version", versions.VersionGuess())) 99 | 100 | return nil 101 | } 102 | -------------------------------------------------------------------------------- /logging/config.go: -------------------------------------------------------------------------------- 1 | package logging 2 | 3 | import ( 4 | "fmt" 5 | "go.uber.org/zap" 6 | ) 7 | 8 | type Config struct { 9 | Level string `koanf:"level"` 10 | } 11 | 12 | func (c *Config) SetDefaults() { 13 | c.Level = "info" 14 | } 15 | 16 | func (c *Config) Validate() error { 17 | level := zap.NewAtomicLevel() 18 | err := level.UnmarshalText([]byte(c.Level)) 19 | if err != nil { 20 | return fmt.Errorf("failed to parse logger level: %w", err) 21 | } 22 | 23 | return nil 24 | } 25 | -------------------------------------------------------------------------------- /logging/logger.go: -------------------------------------------------------------------------------- 1 | package logging 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/prometheus/client_golang/prometheus" 7 | "github.com/prometheus/client_golang/prometheus/promauto" 8 | "go.uber.org/zap/zapcore" 9 | 10 | "go.uber.org/zap" 11 | ) 12 | 13 | // NewLogger creates a preconfigured global logger and configures the global zap logger 14 | func NewLogger(cfg Config, metricsNamespace string) *zap.Logger { 15 | encoderCfg := zap.NewProductionEncoderConfig() 16 | encoderCfg.EncodeTime = zapcore.ISO8601TimeEncoder 17 | 18 | // Parse log level text to zap.LogLevel. Error check isn't required because the input is already validated. 19 | level := zap.NewAtomicLevel() 20 | _ = level.UnmarshalText([]byte(cfg.Level)) 21 | 22 | core := zapcore.NewCore( 23 | zapcore.NewJSONEncoder(encoderCfg), 24 | zapcore.Lock(os.Stdout), 25 | level, 26 | ) 27 | core = zapcore.RegisterHooks(core, prometheusHook(metricsNamespace)) 28 | logger := zap.New(core) 29 | zap.ReplaceGlobals(logger) 30 | 31 | return logger 32 | } 33 | 34 | // prometheusHook is a hook for the zap library which exposes Prometheus counters for various log levels. 
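// Minimal sketch (not part of logger.go) of the hook mechanism used below:
// zapcore.RegisterHooks wraps a core so that every entry passing its level check is
// also handed to the hook functions, which is what allows counting log messages per level.
func exampleCountingCore() zapcore.Core {
	counts := map[zapcore.Level]int{}
	core := zapcore.NewCore(
		zapcore.NewJSONEncoder(zap.NewProductionEncoderConfig()),
		zapcore.Lock(os.Stdout),
		zap.InfoLevel,
	)
	// Every entry written through the returned core also increments counts.
	return zapcore.RegisterHooks(core, func(e zapcore.Entry) error {
		counts[e.Level]++ // the real hook below increments a Prometheus counter instead
		return nil
	})
}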
35 | func prometheusHook(metricsNamespace string) func(zapcore.Entry) error { 36 | messageCounterVec := promauto.NewCounterVec(prometheus.CounterOpts{ 37 | Namespace: metricsNamespace, 38 | Name: "log_messages_total", 39 | Help: "Total number of log messages by log level emitted by KMinion.", 40 | }, []string{"level"}) 41 | 42 | // Initialize counters for all supported log levels so that they expose 0 for each level on startup 43 | supportedLevels := []zapcore.Level{ 44 | zapcore.DebugLevel, 45 | zapcore.InfoLevel, 46 | zapcore.WarnLevel, 47 | zapcore.ErrorLevel, 48 | zapcore.FatalLevel, 49 | zapcore.PanicLevel, 50 | } 51 | for _, level := range supportedLevels { 52 | messageCounterVec.WithLabelValues(level.String()) 53 | } 54 | 55 | return func(entry zapcore.Entry) error { 56 | messageCounterVec.WithLabelValues(entry.Level.String()).Inc() 57 | return nil 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "net" 8 | "net/http" 9 | "os" 10 | "os/signal" 11 | "strconv" 12 | 13 | "github.com/cloudhut/kminion/v2/e2e" 14 | "github.com/cloudhut/kminion/v2/kafka" 15 | "github.com/cloudhut/kminion/v2/logging" 16 | "github.com/cloudhut/kminion/v2/minion" 17 | "github.com/cloudhut/kminion/v2/prometheus" 18 | promclient "github.com/prometheus/client_golang/prometheus" 19 | "github.com/prometheus/client_golang/prometheus/promhttp" 20 | "go.uber.org/zap" 21 | ) 22 | 23 | var ( 24 | // ------------------------------------------------------------------------ 25 | // Below parameters are set at build time using ldflags. 26 | // ------------------------------------------------------------------------ 27 | 28 | // version is KMinion's SemVer version (for example: v1.0.0). 29 | version = "development" 30 | // builtAt is a string that represent a human-readable date when the binary was built. 31 | builtAt = "N/A" 32 | // commit is a string that represents the last git commit for this build. 33 | commit = "N/A" 34 | ) 35 | 36 | func main() { 37 | startupLogger, err := zap.NewProduction() 38 | if err != nil { 39 | panic(fmt.Errorf("failed to create startup logger: %w", err)) 40 | } 41 | 42 | cfg, err := newConfig(startupLogger) 43 | if err != nil { 44 | startupLogger.Fatal("failed to parse config", zap.Error(err)) 45 | } 46 | 47 | logger := logging.NewLogger(cfg.Logger, cfg.Exporter.Namespace).Named("main") 48 | if err != nil { 49 | startupLogger.Fatal("failed to create new logger", zap.Error(err)) 50 | } 51 | 52 | logger.Info("started kminion", zap.String("version", version), zap.String("built_at", builtAt)) 53 | 54 | // Setup context that stops when the application receives an interrupt signal 55 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) 56 | defer stop() 57 | 58 | wrappedRegisterer := promclient.WrapRegistererWithPrefix(cfg.Exporter.Namespace+"_", promclient.DefaultRegisterer) 59 | 60 | // Create kafka service 61 | kafkaSvc := kafka.NewService(cfg.Kafka, logger) 62 | 63 | // Create minion service 64 | // Prometheus exporter only talks to the minion service which 65 | // issues all the requests to Kafka and wraps the interface accordingly. 
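// Sketch (not part of main.go, hypothetical metric name): a registerer wrapped with
// WrapRegistererWithPrefix prepends the configured namespace to every metric registered
// through it, so collectors can use short names while the exposed series still start
// with e.g. "kminion_".
func exampleWrappedRegisterer() promclient.Registerer {
	wrapped := promclient.WrapRegistererWithPrefix("kminion_", promclient.NewRegistry())
	c := promclient.NewCounter(promclient.CounterOpts{Name: "roundtrips_total"}) // exposed as kminion_roundtrips_total
	wrapped.MustRegister(c)
	return wrapped
}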
66 | minionSvc, err := minion.NewService(cfg.Minion, logger, kafkaSvc, cfg.Exporter.Namespace, ctx) 67 | if err != nil { 68 | logger.Fatal("failed to setup minion service", zap.Error(err)) 69 | } 70 | 71 | err = minionSvc.Start(ctx) 72 | if err != nil { 73 | logger.Fatal("failed to start minion service", zap.Error(err)) 74 | } 75 | 76 | // Create end to end testing service 77 | if cfg.Minion.EndToEnd.Enabled { 78 | e2eService, err := e2e.NewService( 79 | ctx, 80 | cfg.Minion.EndToEnd, 81 | logger, 82 | kafkaSvc, 83 | wrappedRegisterer, 84 | ) 85 | if err != nil { 86 | logger.Fatal("failed to create end-to-end monitoring service: %w", zap.Error(err)) 87 | } 88 | 89 | if err = e2eService.Start(ctx); err != nil { 90 | logger.Fatal("failed to start end-to-end monitoring service", zap.Error(err)) 91 | } 92 | } 93 | 94 | // The Prometheus exporter that implements the Prometheus collector interface 95 | exporter, err := prometheus.NewExporter(cfg.Exporter, logger, minionSvc) 96 | if err != nil { 97 | logger.Fatal("failed to setup prometheus exporter", zap.Error(err)) 98 | } 99 | exporter.InitializeMetrics() 100 | 101 | promclient.MustRegister(exporter) 102 | http.Handle("/metrics", 103 | promhttp.InstrumentMetricHandler( 104 | promclient.DefaultRegisterer, 105 | promhttp.HandlerFor( 106 | promclient.DefaultGatherer, 107 | promhttp.HandlerOpts{}, 108 | ), 109 | ), 110 | ) 111 | http.Handle("/ready", minionSvc.HandleIsReady()) 112 | 113 | // Start HTTP server 114 | address := net.JoinHostPort(cfg.Exporter.Host, strconv.Itoa(cfg.Exporter.Port)) 115 | srv := &http.Server{Addr: address} 116 | go func() { 117 | <-ctx.Done() 118 | if err := srv.Shutdown(context.Background()); err != nil { 119 | logger.Error("error stopping HTTP server", zap.Error(err)) 120 | os.Exit(1) 121 | } 122 | }() 123 | logger.Info("listening on address", zap.String("listen_address", address)) 124 | if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { 125 | logger.Error("error starting HTTP server", zap.Error(err)) 126 | os.Exit(1) 127 | } 128 | 129 | logger.Info("kminion stopped") 130 | } 131 | -------------------------------------------------------------------------------- /minion/client_hooks.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "net" 5 | "time" 6 | 7 | "github.com/prometheus/client_golang/prometheus" 8 | "github.com/prometheus/client_golang/prometheus/promauto" 9 | "github.com/twmb/franz-go/pkg/kgo" 10 | "go.uber.org/zap" 11 | ) 12 | 13 | // clientHooks implements the various hook interfaces from the franz-go (kafka) library. We can use these hooks to 14 | // log additional information, collect Prometheus metrics and similar. 
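// Usage sketch (an assumption, not the actual wiring): franz-go inspects which hook
// interfaces a value implements (kgo.HookBrokerConnect, kgo.HookBrokerDisconnect,
// kgo.HookBrokerRead, kgo.HookBrokerWrite, ...) and invokes only those, so the hooks
// defined below are attached to a client with a single option. The broker address is
// a placeholder.
func exampleClientWithHooks(logger *zap.Logger) (*kgo.Client, error) {
	hooks := newMinionClientHooks(logger, "kminion")
	return kgo.NewClient(
		kgo.SeedBrokers("localhost:9092"),
		kgo.WithHooks(hooks),
	)
}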
15 | type clientHooks struct { 16 | logger *zap.Logger 17 | 18 | requestSentCount prometheus.Counter 19 | bytesSent prometheus.Counter 20 | 21 | requestsReceivedCount prometheus.Counter 22 | bytesReceived prometheus.Counter 23 | } 24 | 25 | func newMinionClientHooks(logger *zap.Logger, metricsNamespace string) *clientHooks { 26 | requestSentCount := promauto.NewCounter(prometheus.CounterOpts{ 27 | Namespace: metricsNamespace, 28 | Subsystem: "kafka", 29 | Name: "requests_sent_total"}) 30 | bytesSent := promauto.NewCounter(prometheus.CounterOpts{ 31 | Namespace: metricsNamespace, 32 | Subsystem: "kafka", 33 | Name: "sent_bytes", 34 | }) 35 | 36 | requestsReceivedCount := promauto.NewCounter(prometheus.CounterOpts{ 37 | Namespace: metricsNamespace, 38 | Subsystem: "kafka", 39 | Name: "requests_received_total"}) 40 | bytesReceived := promauto.NewCounter(prometheus.CounterOpts{ 41 | Namespace: metricsNamespace, 42 | Subsystem: "kafka", 43 | Name: "received_bytes", 44 | }) 45 | 46 | return &clientHooks{ 47 | logger: logger, 48 | 49 | requestSentCount: requestSentCount, 50 | bytesSent: bytesSent, 51 | 52 | requestsReceivedCount: requestsReceivedCount, 53 | bytesReceived: bytesReceived, 54 | } 55 | } 56 | 57 | func (c clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur time.Duration, _ net.Conn, err error) { 58 | if err != nil { 59 | c.logger.Debug("kafka connection failed", zap.String("broker_host", meta.Host), zap.Error(err)) 60 | return 61 | } 62 | c.logger.Debug("kafka connection succeeded", 63 | zap.String("host", meta.Host), 64 | zap.Duration("dial_duration", dialDur)) 65 | } 66 | 67 | func (c clientHooks) OnBrokerDisconnect(meta kgo.BrokerMetadata, _ net.Conn) { 68 | c.logger.Debug("kafka broker disconnected", 69 | zap.String("host", meta.Host)) 70 | } 71 | 72 | // OnBrokerRead is passed the broker metadata, the key for the response that 73 | // was read, the number of bytes read, how long the Client waited 74 | // before reading the response, how long it took to read the response, 75 | // and any error. 76 | // 77 | // The bytes written does not count any tls overhead. 78 | // OnRead is called after a read from a broker. 79 | func (c clientHooks) OnBrokerRead(_ kgo.BrokerMetadata, _ int16, bytesRead int, _, _ time.Duration, _ error) { 80 | c.requestsReceivedCount.Inc() 81 | c.bytesReceived.Add(float64(bytesRead)) 82 | } 83 | 84 | // OnBrokerWrite is passed the broker metadata, the key for the request that 85 | // was written, the number of bytes written, how long the request 86 | // waited before being written, how long it took to write the request, 87 | // and any error. 88 | // 89 | // The bytes written does not count any tls overhead. 90 | // OnWrite is called after a write to a broker. 
91 | func (c clientHooks) OnBrokerWrite(_ kgo.BrokerMetadata, _ int16, bytesWritten int, _, _ time.Duration, _ error) { 92 | c.requestSentCount.Inc() 93 | c.bytesSent.Add(float64(bytesWritten)) 94 | } 95 | -------------------------------------------------------------------------------- /minion/config.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/cloudhut/kminion/v2/e2e" 7 | ) 8 | 9 | type Config struct { 10 | ConsumerGroups ConsumerGroupConfig `koanf:"consumerGroups"` 11 | Topics TopicConfig `koanf:"topics"` 12 | LogDirs LogDirsConfig `koanf:"logDirs"` 13 | EndToEnd e2e.Config `koanf:"endToEnd"` 14 | } 15 | 16 | func (c *Config) SetDefaults() { 17 | c.ConsumerGroups.SetDefaults() 18 | c.Topics.SetDefaults() 19 | c.LogDirs.SetDefaults() 20 | c.EndToEnd.SetDefaults() 21 | } 22 | 23 | func (c *Config) Validate() error { 24 | err := c.ConsumerGroups.Validate() 25 | if err != nil { 26 | return fmt.Errorf("failed to consumer group config: %w", err) 27 | } 28 | 29 | err = c.Topics.Validate() 30 | if err != nil { 31 | return fmt.Errorf("failed to validate topic config: %w", err) 32 | } 33 | 34 | err = c.LogDirs.Validate() 35 | if err != nil { 36 | return fmt.Errorf("failed to validate log dirs config: %w", err) 37 | } 38 | 39 | err = c.EndToEnd.Validate() 40 | if err != nil { 41 | return fmt.Errorf("failed to validate endToEnd config: %w", err) 42 | } 43 | 44 | return nil 45 | } 46 | -------------------------------------------------------------------------------- /minion/config_consumer_group.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | const ( 8 | ConsumerGroupScrapeModeOffsetsTopic string = "offsetsTopic" 9 | ConsumerGroupScrapeModeAdminAPI string = "adminApi" 10 | 11 | ConsumerGroupGranularityTopic string = "topic" 12 | ConsumerGroupGranularityPartition string = "partition" 13 | ) 14 | 15 | type ConsumerGroupConfig struct { 16 | // Enabled specifies whether consumer groups shall be scraped and exported or not. 17 | Enabled bool `koanf:"enabled"` 18 | 19 | // Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal 20 | // __consumer_offsets topic. 21 | ScrapeMode string `koanf:"scrapeMode"` 22 | 23 | // Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and 24 | // you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed 25 | // and only topic lags will be exported. 26 | Granularity string `koanf:"granularity"` 27 | 28 | // AllowedGroups are regex strings of group ids that shall be exported 29 | AllowedGroupIDs []string `koanf:"allowedGroups"` 30 | 31 | // IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups 32 | // take precedence over allowed groups. 33 | IgnoredGroupIDs []string `koanf:"ignoredGroups"` 34 | } 35 | 36 | func (c *ConsumerGroupConfig) SetDefaults() { 37 | c.Enabled = true 38 | c.ScrapeMode = ConsumerGroupScrapeModeAdminAPI 39 | c.Granularity = ConsumerGroupGranularityPartition 40 | c.AllowedGroupIDs = []string{"/.*/"} 41 | } 42 | 43 | func (c *ConsumerGroupConfig) Validate() error { 44 | switch c.ScrapeMode { 45 | case ConsumerGroupScrapeModeOffsetsTopic, ConsumerGroupScrapeModeAdminAPI: 46 | default: 47 | return fmt.Errorf("invalid scrape mode '%v' specified. 
Valid modes are '%v' or '%v'", 48 | c.ScrapeMode, 49 | ConsumerGroupScrapeModeOffsetsTopic, 50 | ConsumerGroupScrapeModeAdminAPI) 51 | } 52 | 53 | switch c.Granularity { 54 | case ConsumerGroupGranularityTopic, ConsumerGroupGranularityPartition: 55 | default: 56 | return fmt.Errorf("invalid consumer group granularity '%v' specified. Valid modes are '%v' or '%v'", 57 | c.Granularity, 58 | ConsumerGroupGranularityTopic, 59 | ConsumerGroupGranularityPartition) 60 | } 61 | 62 | // Check if all group strings are valid regex or literals 63 | for _, groupID := range c.AllowedGroupIDs { 64 | _, err := compileRegex(groupID) 65 | if err != nil { 66 | return fmt.Errorf("allowed group string '%v' is not valid regex", groupID) 67 | } 68 | } 69 | 70 | for _, groupID := range c.IgnoredGroupIDs { 71 | _, err := compileRegex(groupID) 72 | if err != nil { 73 | return fmt.Errorf("ignored group string '%v' is not valid regex", groupID) 74 | } 75 | } 76 | 77 | return nil 78 | } 79 | -------------------------------------------------------------------------------- /minion/config_log_dirs.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | type LogDirsConfig struct { 4 | // Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior 5 | // to version 1.0.0 as describing log dirs was not supported back then. 6 | Enabled bool `koanf:"enabled"` 7 | } 8 | 9 | // Validate if provided LogDirsConfig is valid. 10 | func (c *LogDirsConfig) Validate() error { 11 | return nil 12 | } 13 | 14 | // SetDefaults for topic config 15 | func (c *LogDirsConfig) SetDefaults() { 16 | c.Enabled = true 17 | } 18 | -------------------------------------------------------------------------------- /minion/config_topic_config.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | const ( 8 | TopicGranularityTopic string = "topic" 9 | TopicGranularityPartition string = "partition" 10 | ) 11 | 12 | type TopicConfig struct { 13 | // Enabled can be set to false in order to not collect any topic metrics at all. 14 | Enabled bool `koanf:"enabled"` 15 | 16 | // Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and 17 | // you aren't interested in per partition metrics you could choose "topic". 18 | Granularity string `koanf:"granularity"` 19 | 20 | // AllowedTopics are regex strings of topic names whose topic metrics that shall be exported. 21 | AllowedTopics []string `koanf:"allowedTopics"` 22 | 23 | // IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics 24 | // take precedence over allowed topics. 25 | IgnoredTopics []string `koanf:"ignoredTopics"` 26 | 27 | // InfoMetric configures how the kafka_topic_info metric is populated 28 | InfoMetric InfoMetricConfig `koanf:"infoMetric"` 29 | } 30 | 31 | type InfoMetricConfig struct { 32 | // ConfigKeys configures optional topic configuration keys that should be exported 33 | // as prometheus metric labels. 34 | // By default only "cleanup.policy" is exported 35 | ConfigKeys []string `koanf:"configKeys"` 36 | } 37 | 38 | // Validate if provided TopicConfig is valid. 
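// Conceptual sketch (not KMinion's actual filtering code, which lives with compileRegex
// elsewhere in this package; uses the standard library "regexp" package): ignored
// patterns take precedence over allowed ones, so a topic or group is exported only if
// it matches at least one allowed regex and none of the ignored regexes.
func exampleIsExported(name string, allowed, ignored []*regexp.Regexp) bool {
	for _, re := range ignored {
		if re.MatchString(name) {
			return false
		}
	}
	for _, re := range allowed {
		if re.MatchString(name) {
			return true
		}
	}
	return false
}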
39 | func (c *TopicConfig) Validate() error { 40 | switch c.Granularity { 41 | case TopicGranularityPartition, TopicGranularityTopic: 42 | default: 43 | return fmt.Errorf("given granularity '%v' is invalid", c.Granularity) 44 | } 45 | 46 | // Check whether each provided string is valid regex 47 | for _, topic := range c.AllowedTopics { 48 | _, err := compileRegex(topic) 49 | if err != nil { 50 | return fmt.Errorf("allowed topic string '%v' is not valid regex", topic) 51 | } 52 | } 53 | 54 | for _, topic := range c.IgnoredTopics { 55 | _, err := compileRegex(topic) 56 | if err != nil { 57 | return fmt.Errorf("ignored topic string '%v' is not valid regex", topic) 58 | } 59 | } 60 | 61 | return nil 62 | } 63 | 64 | // SetDefaults for topic config 65 | func (c *TopicConfig) SetDefaults() { 66 | c.Enabled = true 67 | c.Granularity = TopicGranularityPartition 68 | c.AllowedTopics = []string{"/.*/"} 69 | c.InfoMetric = InfoMetricConfig{ConfigKeys: []string{"cleanup.policy"}} 70 | } 71 | -------------------------------------------------------------------------------- /minion/consumer_group_offsets.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "sync" 7 | 8 | "github.com/twmb/franz-go/pkg/kmsg" 9 | "go.uber.org/zap" 10 | "golang.org/x/sync/errgroup" 11 | ) 12 | 13 | // ListAllConsumerGroupOffsetsInternal returns a map from the in memory storage. The map value is the offset commit 14 | // value and is grouped by group id, topic, partition id as keys of the nested maps. 15 | func (s *Service) ListAllConsumerGroupOffsetsInternal() map[string]map[string]map[int32]OffsetCommit { 16 | return s.storage.getGroupOffsets() 17 | } 18 | 19 | // ListAllConsumerGroupOffsetsAdminAPI return all consumer group offsets using Kafka's Admin API. 
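// Distilled sketch (not part of this file) of the fan-out used by
// listConsumerGroupOffsetsBulk below: one errgroup goroutine per consumer group,
// a mutex-guarded result map, and per-group failures tolerated so a single broken
// group cannot fail the whole scrape.
func exampleFetchAll(ctx context.Context, groups []string,
	fetch func(context.Context, string) (*kmsg.OffsetFetchResponse, error),
) map[string]*kmsg.OffsetFetchResponse {
	var mu sync.Mutex
	res := make(map[string]*kmsg.OffsetFetchResponse, len(groups))

	eg, egCtx := errgroup.WithContext(ctx)
	for _, group := range groups {
		eg.Go(func() error {
			offsets, err := fetch(egCtx, group)
			if err != nil {
				return nil // tolerated; the real code logs a warning and skips the group
			}
			mu.Lock()
			res[group] = offsets
			mu.Unlock()
			return nil
		})
	}
	_ = eg.Wait() // no goroutine returns an error in this sketch
	return res
}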
20 | func (s *Service) ListAllConsumerGroupOffsetsAdminAPI(ctx context.Context) (map[string]*kmsg.OffsetFetchResponse, error) { 21 | groupsRes, err := s.listConsumerGroupsCached(ctx) 22 | if err != nil { 23 | return nil, fmt.Errorf("failed to list groupsRes: %w", err) 24 | } 25 | groupIDs := make([]string, len(groupsRes.Groups)) 26 | for i, group := range groupsRes.Groups { 27 | groupIDs[i] = group.Group 28 | } 29 | 30 | return s.listConsumerGroupOffsetsBulk(ctx, groupIDs) 31 | } 32 | 33 | // listConsumerGroupOffsetsBulk returns a map which has the Consumer group name as key 34 | func (s *Service) listConsumerGroupOffsetsBulk(ctx context.Context, groups []string) (map[string]*kmsg.OffsetFetchResponse, error) { 35 | eg, _ := errgroup.WithContext(ctx) 36 | 37 | mutex := sync.Mutex{} 38 | res := make(map[string]*kmsg.OffsetFetchResponse) 39 | 40 | f := func(group string) func() error { 41 | return func() error { 42 | offsets, err := s.listConsumerGroupOffsets(ctx, group) 43 | if err != nil { 44 | s.logger.Warn("failed to fetch consumer group offsets, inner kafka error", 45 | zap.String("consumer_group", group), 46 | zap.Error(err)) 47 | return nil 48 | } 49 | 50 | mutex.Lock() 51 | res[group] = offsets 52 | mutex.Unlock() 53 | return nil 54 | } 55 | } 56 | 57 | for _, group := range groups { 58 | eg.Go(f(group)) 59 | } 60 | 61 | if err := eg.Wait(); err != nil { 62 | return nil, err 63 | } 64 | 65 | return res, nil 66 | } 67 | 68 | // listConsumerGroupOffsets returns the committed group offsets for a single group 69 | func (s *Service) listConsumerGroupOffsets(ctx context.Context, group string) (*kmsg.OffsetFetchResponse, error) { 70 | req := kmsg.NewOffsetFetchRequest() 71 | req.Group = group 72 | req.Topics = nil 73 | res, err := req.RequestWith(ctx, s.client) 74 | if err != nil { 75 | return nil, fmt.Errorf("failed to request group offsets for group '%v': %w", group, err) 76 | } 77 | 78 | return res, nil 79 | } 80 | -------------------------------------------------------------------------------- /minion/describe_consumer_groups.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/twmb/franz-go/pkg/kerr" 9 | "github.com/twmb/franz-go/pkg/kgo" 10 | "github.com/twmb/franz-go/pkg/kmsg" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | type DescribeConsumerGroupsResponse struct { 15 | BrokerMetadata kgo.BrokerMetadata 16 | Groups *kmsg.DescribeGroupsResponse 17 | } 18 | 19 | func (s *Service) listConsumerGroupsCached(ctx context.Context) (*kmsg.ListGroupsResponse, error) { 20 | reqId := ctx.Value("requestId").(string) 21 | key := "list-consumer-groups-" + reqId 22 | 23 | if cachedRes, exists := s.getCachedItem(key); exists { 24 | return cachedRes.(*kmsg.ListGroupsResponse), nil 25 | } 26 | res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) { 27 | res, err := s.listConsumerGroups(ctx) 28 | if err != nil { 29 | return nil, err 30 | } 31 | s.setCachedItem(key, res, 120*time.Second) 32 | 33 | return res, nil 34 | }) 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | return res.(*kmsg.ListGroupsResponse), nil 40 | } 41 | 42 | func (s *Service) listConsumerGroups(ctx context.Context) (*kmsg.ListGroupsResponse, error) { 43 | listReq := kmsg.NewListGroupsRequest() 44 | res, err := listReq.RequestWith(ctx, s.client) 45 | if err != nil { 46 | return nil, fmt.Errorf("failed to list consumer groups: %w", err) 47 | } 48 | err = kerr.ErrorForCode(res.ErrorCode) 49 | if err 
!= nil { 50 | return nil, fmt.Errorf("failed to list consumer groups. inner kafka error: %w", err) 51 | } 52 | 53 | return res, nil 54 | } 55 | 56 | func (s *Service) DescribeConsumerGroups(ctx context.Context) ([]DescribeConsumerGroupsResponse, error) { 57 | listRes, err := s.listConsumerGroupsCached(ctx) 58 | if err != nil { 59 | return nil, err 60 | } 61 | 62 | groupIDs := make([]string, len(listRes.Groups)) 63 | for i, group := range listRes.Groups { 64 | groupIDs[i] = group.Group 65 | } 66 | 67 | describeReq := kmsg.NewDescribeGroupsRequest() 68 | describeReq.Groups = groupIDs 69 | describeReq.IncludeAuthorizedOperations = false 70 | shardedResp := s.client.RequestSharded(ctx, &describeReq) 71 | 72 | describedGroups := make([]DescribeConsumerGroupsResponse, 0) 73 | for _, kresp := range shardedResp { 74 | if kresp.Err != nil { 75 | s.logger.Warn("broker failed to respond to the described groups request", 76 | zap.Int32("broker_id", kresp.Meta.NodeID), 77 | zap.Error(kresp.Err)) 78 | continue 79 | } 80 | res := kresp.Resp.(*kmsg.DescribeGroupsResponse) 81 | 82 | describedGroups = append(describedGroups, DescribeConsumerGroupsResponse{ 83 | BrokerMetadata: kresp.Meta, 84 | Groups: res, 85 | }) 86 | } 87 | 88 | return describedGroups, nil 89 | } 90 | -------------------------------------------------------------------------------- /minion/describe_topic_config.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/pkg/errors" 8 | "github.com/twmb/franz-go/pkg/kmsg" 9 | ) 10 | 11 | func (s *Service) GetTopicConfigs(ctx context.Context) (*kmsg.DescribeConfigsResponse, error) { 12 | metadata, err := s.GetMetadataCached(ctx) 13 | if err != nil { 14 | return nil, errors.Wrap(err, "failed to get metadata") 15 | } 16 | 17 | req := kmsg.NewDescribeConfigsRequest() 18 | 19 | for _, topic := range metadata.Topics { 20 | resourceReq := kmsg.NewDescribeConfigsRequestResource() 21 | resourceReq.ResourceType = kmsg.ConfigResourceTypeTopic 22 | resourceReq.ResourceName = *topic.Topic 23 | req.Resources = append(req.Resources, resourceReq) 24 | } 25 | 26 | res, err := req.RequestWith(ctx, s.client) 27 | if err != nil { 28 | return nil, fmt.Errorf("failed to request metadata: %w", err) 29 | } 30 | 31 | return res, nil 32 | } 33 | -------------------------------------------------------------------------------- /minion/list_offsets.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "strconv" 8 | "time" 9 | 10 | "github.com/twmb/franz-go/pkg/kadm" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | func (s *Service) ListOffsetsCached(ctx context.Context, timestamp int64) (kadm.ListedOffsets, error) { 15 | reqId := ctx.Value("requestId").(string) 16 | key := "partition-offsets-" + strconv.Itoa(int(timestamp)) + "-" + reqId 17 | 18 | if cachedRes, exists := s.getCachedItem(key); exists { 19 | return cachedRes.(kadm.ListedOffsets), nil 20 | } 21 | 22 | res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) { 23 | offsets, err := s.ListOffsets(ctx, timestamp) 24 | if err != nil { 25 | return nil, err 26 | } 27 | 28 | s.setCachedItem(key, offsets, 120*time.Second) 29 | 30 | return offsets, nil 31 | }) 32 | if err != nil { 33 | return nil, err 34 | } 35 | 36 | return res.(kadm.ListedOffsets), nil 37 | } 38 | 39 | // ListOffsets fetches the low (timestamp: -2) or high water mark (timestamp: -1) for 
all topic partitions 40 | func (s *Service) ListOffsets(ctx context.Context, timestamp int64) (kadm.ListedOffsets, error) { 41 | listedOffsets, err := s.admClient.ListEndOffsets(ctx) 42 | if err != nil { 43 | var se *kadm.ShardErrors 44 | if !errors.As(err, &se) { 45 | return nil, fmt.Errorf("failed to list offsets: %w", err) 46 | } 47 | 48 | if se.AllFailed { 49 | return nil, fmt.Errorf("failed to list offsets, all shard responses failed: %w", err) 50 | } 51 | s.logger.Info("failed to list offset from some shards", zap.Int("failed_shards", len(se.Errs))) 52 | for _, shardErr := range se.Errs { 53 | s.logger.Warn("shard error for listing end offsets", 54 | zap.Int32("broker_id", shardErr.Broker.NodeID), 55 | zap.Error(shardErr.Err)) 56 | } 57 | } 58 | 59 | // Log inner errors before returning them. We do that inside of this function to avoid duplicate logging as the response 60 | // are cached for each scrape anyways. 61 | // 62 | // Create two metrics to aggregate error logs in few messages. Logging one message per occured partition error 63 | // is too much. Typical errors are LEADER_NOT_AVAILABLE etc. 64 | errorCountByErrCode := make(map[error]int) 65 | errorCountByTopic := make(map[string]int) 66 | 67 | // Iterate on all partitions 68 | listedOffsets.Each(func(offset kadm.ListedOffset) { 69 | if offset.Err != nil { 70 | errorCountByTopic[offset.Topic]++ 71 | errorCountByErrCode[offset.Err]++ 72 | } 73 | }) 74 | 75 | // Print log line for each error type 76 | for err, count := range errorCountByErrCode { 77 | s.logger.Warn("failed to list some partitions watermarks", 78 | zap.Error(err), 79 | zap.Int("error_count", count)) 80 | } 81 | if len(errorCountByTopic) > 0 { 82 | s.logger.Warn("some topics had one or more partitions whose watermarks could not be fetched from Kafka", 83 | zap.Int("topics_with_errors", len(errorCountByTopic))) 84 | } 85 | 86 | return listedOffsets, nil 87 | } 88 | -------------------------------------------------------------------------------- /minion/log_dirs.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/twmb/franz-go/pkg/kgo" 7 | "github.com/twmb/franz-go/pkg/kmsg" 8 | ) 9 | 10 | type LogDirResponseShard struct { 11 | Err error 12 | Broker kgo.BrokerMetadata 13 | LogDirs *kmsg.DescribeLogDirsResponse 14 | } 15 | 16 | func (s *Service) DescribeLogDirs(ctx context.Context) []LogDirResponseShard { 17 | req := kmsg.NewDescribeLogDirsRequest() 18 | req.Topics = nil // Describe all topics 19 | responses := s.client.RequestSharded(ctx, &req) 20 | 21 | res := make([]LogDirResponseShard, len(responses)) 22 | for i, responseShard := range responses { 23 | logDirs, ok := responseShard.Resp.(*kmsg.DescribeLogDirsResponse) 24 | if !ok { 25 | logDirs = &kmsg.DescribeLogDirsResponse{} 26 | } 27 | 28 | res[i] = LogDirResponseShard{ 29 | Err: responseShard.Err, 30 | Broker: responseShard.Meta, 31 | LogDirs: logDirs, 32 | } 33 | } 34 | 35 | return res 36 | } 37 | -------------------------------------------------------------------------------- /minion/metadata.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/twmb/franz-go/pkg/kmsg" 9 | ) 10 | 11 | func (s *Service) GetMetadataCached(ctx context.Context) (*kmsg.MetadataResponse, error) { 12 | reqId := ctx.Value("requestId").(string) 13 | key := "metadata-" + reqId 14 | 15 | if cachedRes, exists := 
s.getCachedItem(key); exists { 16 | return cachedRes.(*kmsg.MetadataResponse), nil 17 | } 18 | 19 | res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) { 20 | metadata, err := s.GetMetadata(ctx) 21 | if err != nil { 22 | return nil, err 23 | } 24 | 25 | s.setCachedItem(key, metadata, 120*time.Second) 26 | 27 | return metadata, nil 28 | }) 29 | if err != nil { 30 | return nil, err 31 | } 32 | 33 | return res.(*kmsg.MetadataResponse), nil 34 | } 35 | 36 | func (s *Service) GetMetadata(ctx context.Context) (*kmsg.MetadataResponse, error) { 37 | req := kmsg.NewMetadataRequest() 38 | req.Topics = nil 39 | 40 | res, err := req.RequestWith(ctx, s.client) 41 | if err != nil { 42 | return nil, fmt.Errorf("failed to request metadata: %w", err) 43 | } 44 | 45 | return res, nil 46 | } 47 | -------------------------------------------------------------------------------- /minion/offset_consumer.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/twmb/franz-go/pkg/kbin" 9 | "github.com/twmb/franz-go/pkg/kerr" 10 | "github.com/twmb/franz-go/pkg/kgo" 11 | "github.com/twmb/franz-go/pkg/kmsg" 12 | "go.uber.org/zap" 13 | ) 14 | 15 | // startConsumingOffsets consumes the __consumer_offsets topic and forwards the kafka messages to their respective 16 | // methods where they'll be decoded and further processed. 17 | func (s *Service) startConsumingOffsets(ctx context.Context) { 18 | client := s.client 19 | 20 | s.logger.Info("starting to consume messages from offsets topic") 21 | go s.checkIfConsumerLagIsCaughtUp(ctx) 22 | 23 | for { 24 | select { 25 | case <-ctx.Done(): 26 | return 27 | default: 28 | fetches := client.PollFetches(ctx) 29 | errors := fetches.Errors() 30 | for _, err := range errors { 31 | // Log all errors and continue afterwards as we might get errors and still have some fetch results 32 | s.logger.Error("failed to fetch records from kafka", 33 | zap.String("topic", err.Topic), 34 | zap.Int32("partition", err.Partition), 35 | zap.Error(err.Err)) 36 | } 37 | 38 | iter := fetches.RecordIter() 39 | for !iter.Done() { 40 | record := iter.Next() 41 | s.storage.markRecordConsumed(record) 42 | 43 | err := s.decodeOffsetRecord(record) 44 | if err != nil { 45 | s.logger.Warn("failed to decode offset record", zap.Error(err)) 46 | } 47 | } 48 | } 49 | } 50 | } 51 | 52 | // checkIfConsumerLagIsCaughtUp fetches the newest partition offsets for all partitions in the __consumer_offsets 53 | // topic and compares these against the last consumed messages from our offset consumer. If the consumed offsets are 54 | // higher than the partition offsets this means we caught up the initial lag and can mark our storage as ready. A ready 55 | // store will start to expose consumer group offsets. 56 | func (s *Service) checkIfConsumerLagIsCaughtUp(ctx context.Context) { 57 | for { 58 | time.Sleep(12 * time.Second) 59 | s.logger.Debug("checking if lag in consumer offsets metadataReqTopic is caught up") 60 | 61 | // 1. 
Get metadataReqTopic high watermarks for __consumer_offsets metadataReqTopic 62 | metadataReq := kmsg.NewMetadataRequest() 63 | metadataReqTopic := kmsg.NewMetadataRequestTopic() 64 | topicName := "__consumer_offsets" 65 | metadataReqTopic.Topic = &topicName 66 | metadataReq.Topics = []kmsg.MetadataRequestTopic{metadataReqTopic} 67 | 68 | res, err := metadataReq.RequestWith(ctx, s.client) 69 | if err != nil { 70 | s.logger.Warn("failed to check if consumer lag on offsets metadataReqTopic is caught up because metadata request failed", 71 | zap.Error(err)) 72 | continue 73 | } 74 | 75 | // 2. Request high watermarks for consumer offset partitions 76 | topicReqs := make([]kmsg.ListOffsetsRequestTopic, len(res.Topics)) 77 | for i, topic := range res.Topics { 78 | req := kmsg.NewListOffsetsRequestTopic() 79 | req.Topic = *topic.Topic 80 | 81 | partitionReqs := make([]kmsg.ListOffsetsRequestTopicPartition, len(topic.Partitions)) 82 | for j, partition := range topic.Partitions { 83 | partitionReqs[j] = kmsg.NewListOffsetsRequestTopicPartition() 84 | partitionReqs[j].Partition = partition.Partition 85 | partitionReqs[j].Timestamp = -1 // Newest 86 | } 87 | req.Partitions = partitionReqs 88 | 89 | topicReqs[i] = req 90 | } 91 | offsetReq := kmsg.NewListOffsetsRequest() 92 | offsetReq.Topics = topicReqs 93 | highMarksRes, err := offsetReq.RequestWith(ctx, s.client) 94 | if err != nil { 95 | s.logger.Warn("failed to check if consumer lag on offsets metadataReqTopic is caught up because high watermark request failed", 96 | zap.Error(err)) 97 | continue 98 | } 99 | if len(highMarksRes.Topics) != 1 { 100 | s.logger.Error("expected exactly one metadataReqTopic response for high water mark request") 101 | continue 102 | } 103 | 104 | // 3. Check if high watermarks have been consumed. To avoid a race condition here we will wait some time before 105 | // comparing, so that the consumer has enough time to catch up to the new high watermarks we just fetched. 106 | time.Sleep(3 * time.Second) 107 | consumedOffsets := s.storage.getConsumedOffsets() 108 | topicRes := highMarksRes.Topics[0] 109 | isReady := true 110 | 111 | type laggingParition struct { 112 | Name string 113 | Id int32 114 | Lag int64 115 | } 116 | var partitionsLagging []laggingParition 117 | totalLag := int64(0) 118 | for _, partition := range topicRes.Partitions { 119 | err := kerr.ErrorForCode(partition.ErrorCode) 120 | if err != nil { 121 | s.logger.Warn("failed to check if consumer lag on offsets metadataReqTopic is caught up because high "+ 122 | "watermark request failed, with an inner error", 123 | zap.Error(err)) 124 | } 125 | 126 | highWaterMark := partition.Offset - 1 127 | consumedOffset := consumedOffsets[partition.Partition] 128 | partitionLag := highWaterMark - consumedOffset 129 | if partitionLag < 0 { 130 | partitionLag = 0 131 | } 132 | 133 | if partitionLag > 0 { 134 | partitionsLagging = append(partitionsLagging, laggingParition{ 135 | Name: topicRes.Topic, 136 | Id: partition.Partition, 137 | Lag: partitionLag, 138 | }) 139 | totalLag += partitionLag 140 | s.logger.Debug("consumer_offsets metadataReqTopic lag has not been caught up yet", 141 | zap.Int32("partition_id", partition.Partition), 142 | zap.Int64("high_water_mark", highWaterMark), 143 | zap.Int64("consumed_offset", consumedOffset), 144 | zap.Int64("partition_lag", partitionLag)) 145 | isReady = false 146 | continue 147 | } 148 | } 149 | if isReady { 150 | s.logger.Info("successfully consumed all consumer offsets. 
consumer group lags will be exported from now on") 151 | s.storage.setReadyState(true) 152 | return 153 | } else { 154 | s.logger.Info("catching up the message lag on consumer offsets", 155 | zap.Int("lagging_partitions_count", len(partitionsLagging)), 156 | zap.Any("lagging_partitions", partitionsLagging), 157 | zap.Int64("total_lag", totalLag)) 158 | } 159 | } 160 | } 161 | 162 | // decodeOffsetRecord decodes all messages in the consumer offsets topic by routing records to the correct decoding 163 | // method. 164 | func (s *Service) decodeOffsetRecord(record *kgo.Record) error { 165 | if len(record.Key) < 2 { 166 | return fmt.Errorf("offset commit key is supposed to be at least 2 bytes long") 167 | } 168 | messageVer := (&kbin.Reader{Src: record.Key}).Int16() 169 | 170 | switch messageVer { 171 | case 0, 1: 172 | err := s.decodeOffsetCommit(record) 173 | if err != nil { 174 | return err 175 | } 176 | case 2: 177 | err := s.decodeOffsetMetadata(record) 178 | if err != nil { 179 | return err 180 | } 181 | } 182 | 183 | return nil 184 | } 185 | 186 | // decodeOffsetMetadata decodes to metadata which includes the following information: 187 | // - group 188 | // - protocolType (connect/consumer/...) 189 | // - generation 190 | // - protocol 191 | // - currentStateTimestamp 192 | // - groupMembers (member metadata such aus: memberId, groupInstanceId, clientId, clientHost, rebalanceTimeout, ...) 193 | func (s *Service) decodeOffsetMetadata(record *kgo.Record) error { 194 | childLogger := s.logger.With( 195 | zap.String("topic", record.Topic), 196 | zap.Int32("partition_id", record.Partition), 197 | zap.Int64("offset", record.Offset)) 198 | 199 | metadataKey := kmsg.NewGroupMetadataKey() 200 | err := metadataKey.ReadFrom(record.Key) 201 | if err != nil { 202 | childLogger.Warn("failed to decode offset metadata key", zap.Error(err)) 203 | return fmt.Errorf("failed to decode offset metadata key: %w", err) 204 | } 205 | 206 | if record.Value == nil { 207 | return nil 208 | } 209 | metadataValue := kmsg.NewGroupMetadataValue() 210 | err = metadataValue.ReadFrom(record.Value) 211 | if err != nil { 212 | childLogger.Warn("failed to decode offset metadata value", zap.Error(err)) 213 | return fmt.Errorf("failed to decode offset metadata value: %w", err) 214 | } 215 | 216 | return nil 217 | } 218 | 219 | // decodeOffsetCommit decodes to group offsets which include the following information: 220 | // - group, topic, partition 221 | // - offset 222 | // - leaderEpoch 223 | // - metadata (user specified string for each offset commit) 224 | // - commitTimestamp 225 | // - expireTimestamp (only version 1 offset commits / deprecated) 226 | func (s *Service) decodeOffsetCommit(record *kgo.Record) error { 227 | childLogger := s.logger.With( 228 | zap.String("topic", record.Topic), 229 | zap.Int32("partition_id", record.Partition), 230 | zap.Int64("offset", record.Offset)) 231 | offsetCommitKey := kmsg.NewOffsetCommitKey() 232 | err := offsetCommitKey.ReadFrom(record.Key) 233 | if err != nil { 234 | childLogger.Warn("failed to decode offset commit key", zap.Error(err)) 235 | return fmt.Errorf("failed to decode offset commit key: %w", err) 236 | } 237 | 238 | if record.Value == nil { 239 | // Tombstone - The group offset is expired or no longer valid (e.g. 
because the topic has been deleted) 240 | s.storage.deleteOffsetCommit(offsetCommitKey) 241 | return nil 242 | } 243 | 244 | offsetCommitValue := kmsg.NewOffsetCommitValue() 245 | err = offsetCommitValue.ReadFrom(record.Value) 246 | if err != nil { 247 | childLogger.Warn("failed to decode offset commit value", zap.Error(err)) 248 | return fmt.Errorf("failed to decode offset commit value: %w", err) 249 | } 250 | s.storage.addOffsetCommit(offsetCommitKey, offsetCommitValue) 251 | 252 | return nil 253 | } 254 | 255 | func (s *Service) GetNumberOfOffsetRecordsConsumed() float64 { 256 | return s.storage.getNumberOfConsumedRecords() 257 | } 258 | -------------------------------------------------------------------------------- /minion/service.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "net/http" 8 | "regexp" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "github.com/twmb/franz-go/pkg/kadm" 14 | "github.com/twmb/franz-go/pkg/kgo" 15 | "github.com/twmb/franz-go/pkg/kmsg" 16 | "github.com/twmb/franz-go/pkg/kversion" 17 | "go.uber.org/zap" 18 | "golang.org/x/sync/singleflight" 19 | 20 | "github.com/cloudhut/kminion/v2/kafka" 21 | ) 22 | 23 | type Service struct { 24 | Cfg Config 25 | logger *zap.Logger 26 | 27 | // requestGroup is used to deduplicate multiple concurrent requests to kafka 28 | requestGroup *singleflight.Group 29 | cache map[string]interface{} 30 | cacheLock sync.RWMutex 31 | 32 | AllowedGroupIDsExpr []*regexp.Regexp 33 | IgnoredGroupIDsExpr []*regexp.Regexp 34 | AllowedTopicsExpr []*regexp.Regexp 35 | IgnoredTopicsExpr []*regexp.Regexp 36 | 37 | client *kgo.Client 38 | admClient *kadm.Client 39 | storage *Storage 40 | } 41 | 42 | func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricsNamespace string, ctx context.Context) (*Service, error) { 43 | storage, err := newStorage(logger) 44 | if err != nil { 45 | return nil, fmt.Errorf("failed to create storage: %w", err) 46 | } 47 | 48 | // Kafka client 49 | minionHooks := newMinionClientHooks(logger.Named("kafka_hooks"), metricsNamespace) 50 | kgoOpts := []kgo.Opt{ 51 | kgo.WithHooks(minionHooks), 52 | } 53 | if cfg.ConsumerGroups.Enabled && cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeOffsetsTopic { 54 | kgoOpts = append(kgoOpts, 55 | kgo.ConsumeResetOffset(kgo.NewOffset().AtStart()), 56 | kgo.ConsumeTopics("__consumer_offsets")) 57 | } 58 | 59 | logger.Info("connecting to Kafka seed brokers, trying to fetch cluster metadata", 60 | zap.String("seed_brokers", strings.Join(kafkaSvc.Brokers(), ","))) 61 | 62 | client, err := kafkaSvc.CreateAndTestClient(ctx, logger, kgoOpts) 63 | if err != nil { 64 | return nil, fmt.Errorf("failed to create kafka client: %w", err) 65 | } 66 | logger.Info("successfully connected to kafka cluster") 67 | 68 | // Compile regexes. 
We can ignore the errors because valid compilation has been validated already 69 | allowedGroupIDsExpr, _ := compileRegexes(cfg.ConsumerGroups.AllowedGroupIDs) 70 | ignoredGroupIDsExpr, _ := compileRegexes(cfg.ConsumerGroups.IgnoredGroupIDs) 71 | allowedTopicsExpr, _ := compileRegexes(cfg.Topics.AllowedTopics) 72 | ignoredTopicsExpr, _ := compileRegexes(cfg.Topics.IgnoredTopics) 73 | 74 | service := &Service{ 75 | Cfg: cfg, 76 | logger: logger.Named("minion_service"), 77 | 78 | requestGroup: &singleflight.Group{}, 79 | cache: make(map[string]interface{}), 80 | cacheLock: sync.RWMutex{}, 81 | 82 | AllowedGroupIDsExpr: allowedGroupIDsExpr, 83 | IgnoredGroupIDsExpr: ignoredGroupIDsExpr, 84 | AllowedTopicsExpr: allowedTopicsExpr, 85 | IgnoredTopicsExpr: ignoredTopicsExpr, 86 | 87 | client: client, 88 | admClient: kadm.NewClient(client), 89 | 90 | storage: storage, 91 | } 92 | 93 | return service, nil 94 | } 95 | 96 | func (s *Service) Start(ctx context.Context) error { 97 | err := s.ensureCompatibility(ctx) 98 | if err != nil { 99 | return fmt.Errorf("failed to check feature compatibility against Kafka: %w", err) 100 | } 101 | 102 | if s.Cfg.ConsumerGroups.Enabled && s.Cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeOffsetsTopic { 103 | go s.startConsumingOffsets(ctx) 104 | } 105 | 106 | return nil 107 | } 108 | 109 | func (s *Service) isReady() bool { 110 | if s.Cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeAdminAPI { 111 | return true 112 | } 113 | 114 | return s.storage.isReady() 115 | } 116 | 117 | func (s *Service) HandleIsReady() http.HandlerFunc { 118 | type response struct { 119 | StatusCode int `json:"statusCode"` 120 | } 121 | return func(w http.ResponseWriter, r *http.Request) { 122 | status := http.StatusOK 123 | if !s.isReady() { 124 | status = http.StatusServiceUnavailable 125 | } 126 | res := response{StatusCode: status} 127 | resJson, _ := json.Marshal(res) 128 | w.WriteHeader(status) 129 | w.Write(resJson) 130 | } 131 | } 132 | 133 | // ensureCompatibility checks whether the options as configured are available in the connected cluster. For example 134 | // we will check if the target Kafka's API version support the LogDirs request. If that's not the case we will 135 | // disable the option and print a warning message. 136 | func (s *Service) ensureCompatibility(ctx context.Context) error { 137 | ctx, cancel := context.WithTimeout(ctx, 15*time.Second) 138 | defer cancel() 139 | versionsRes, err := s.GetAPIVersions(ctx) 140 | if err != nil { 141 | return fmt.Errorf("kafka api versions couldn't be fetched: %w", err) 142 | } 143 | versions := kversion.FromApiVersionsResponse(versionsRes) 144 | 145 | // Check Describe Log Dirs 146 | if s.Cfg.LogDirs.Enabled { 147 | k := kmsg.NewDescribeLogDirsRequest() 148 | isSupported := versions.HasKey(k.Key()) 149 | if !isSupported { 150 | s.logger.Warn("describing log dirs is enabled, but it is not supported because your Kafka cluster " + 151 | "version is too old. 
feature will be disabled") 152 | s.Cfg.LogDirs.Enabled = false 153 | } 154 | } 155 | 156 | return nil 157 | } 158 | 159 | func (s *Service) getCachedItem(key string) (interface{}, bool) { 160 | s.cacheLock.RLock() 161 | defer s.cacheLock.RUnlock() 162 | 163 | val, exists := s.cache[key] 164 | return val, exists 165 | } 166 | 167 | func (s *Service) setCachedItem(key string, val interface{}, timeout time.Duration) { 168 | s.cacheLock.Lock() 169 | defer s.cacheLock.Unlock() 170 | 171 | go func() { 172 | time.Sleep(timeout) 173 | s.deleteCachedItem(key) 174 | }() 175 | 176 | s.cache[key] = val 177 | } 178 | 179 | func (s *Service) deleteCachedItem(key string) { 180 | s.cacheLock.Lock() 181 | defer s.cacheLock.Unlock() 182 | 183 | delete(s.cache, key) 184 | } 185 | -------------------------------------------------------------------------------- /minion/storage.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | "time" 7 | 8 | cmap "github.com/orcaman/concurrent-map" 9 | "github.com/twmb/franz-go/pkg/kgo" 10 | "github.com/twmb/franz-go/pkg/kmsg" 11 | "go.uber.org/atomic" 12 | "go.uber.org/zap" 13 | ) 14 | 15 | // Storage stores the current state of all consumer group information that has been consumed using the offset consumer. 16 | type Storage struct { 17 | logger *zap.Logger 18 | 19 | // offsetCommits is a map of all consumer offsets. 20 | // A unique key in the format "group:topic:partition" is used as map key. 21 | // Value is of type OffsetCommit 22 | offsetCommits cmap.ConcurrentMap 23 | 24 | // progressTracker is a map that tracks what offsets in each partition have already been consumed 25 | progressTracker cmap.ConcurrentMap 26 | 27 | isReadyBool *atomic.Bool 28 | 29 | // Number of consumed records (used for a Prometheus metric) 30 | consumedRecords *atomic.Float64 31 | } 32 | 33 | // OffsetCommit is used as value for the OffsetCommit map 34 | type OffsetCommit struct { 35 | Key kmsg.OffsetCommitKey 36 | Value kmsg.OffsetCommitValue 37 | 38 | // CommitCount is the number of offset commits for this group-topic-partition combination 39 | CommitCount int 40 | 41 | // ExpireTimestamp is a timestamp that indicates when this offset commit will expire on the Kafka cluster 42 | ExpireTimestamp time.Time 43 | } 44 | 45 | func newStorage(logger *zap.Logger) (*Storage, error) { 46 | return &Storage{ 47 | logger: logger.Named("storage"), 48 | offsetCommits: cmap.New(), 49 | progressTracker: cmap.New(), 50 | isReadyBool: atomic.NewBool(false), 51 | consumedRecords: atomic.NewFloat64(0), 52 | }, nil 53 | } 54 | 55 | func (s *Storage) isReady() bool { 56 | return s.isReadyBool.Load() 57 | } 58 | 59 | func (s *Storage) setReadyState(isReady bool) { 60 | s.isReadyBool.Store(isReady) 61 | } 62 | 63 | // markRecordConsumed stores the latest consumed offset for each partition. This is necessary in order to figure out 64 | // whether we have caught up the message lag when starting KMinion as we start consuming from the very oldest offset 65 | // commit. 66 | func (s *Storage) markRecordConsumed(rec *kgo.Record) { 67 | key := fmt.Sprintf("%v", rec.Partition) 68 | s.progressTracker.Set(key, rec.Offset) 69 | s.consumedRecords.Add(1) 70 | } 71 | 72 | func (s *Storage) addOffsetCommit(key kmsg.OffsetCommitKey, value kmsg.OffsetCommitValue) { 73 | // For performance reasons we'll store offset commits using a "unique key". Writes happen way more frequently than 74 | // reads (Prometheus scraping the endpoint). 
Hence we can group everything by group or topic on the read path as 75 | // needed instead of writing it into nested maps like a map[GroupID]map[Topic]map[Partition] 76 | uniqueKey := encodeOffsetCommitKey(key) 77 | 78 | commitCount := 0 79 | commitInterface, exists := s.offsetCommits.Get(uniqueKey) 80 | if exists { 81 | offsetCommit := commitInterface.(OffsetCommit) 82 | commitCount = offsetCommit.CommitCount 83 | } 84 | 85 | timeDay := 24 * time.Hour 86 | commit := OffsetCommit{ 87 | Key: key, 88 | Value: value, 89 | CommitCount: commitCount + 1, 90 | ExpireTimestamp: time.Unix(0, value.CommitTimestamp*int64(time.Millisecond)).Add(7 * timeDay), 91 | } 92 | s.offsetCommits.Set(uniqueKey, commit) 93 | } 94 | 95 | func (s *Storage) getConsumedOffsets() map[int32]int64 { 96 | offsetsByPartition := make(map[int32]int64) 97 | offsets := s.progressTracker.Items() 98 | for partitionID, offsetStr := range offsets { 99 | val := offsetStr.(int64) 100 | partitionID, _ := strconv.ParseInt(partitionID, 10, 32) 101 | offsetsByPartition[int32(partitionID)] = val 102 | } 103 | 104 | return offsetsByPartition 105 | } 106 | 107 | func (s *Storage) getNumberOfConsumedRecords() float64 { 108 | return s.consumedRecords.Load() 109 | } 110 | 111 | func (s *Storage) getGroupOffsets() map[string]map[string]map[int32]OffsetCommit { 112 | // Offsets by group, topic, partition 113 | offsetsByGroup := make(map[string]map[string]map[int32]OffsetCommit) 114 | 115 | if !s.isReady() { 116 | s.logger.Info("Tried to fetch consumer group offsets, but haven't consumed the whole topic yet") 117 | return offsetsByGroup 118 | } 119 | 120 | offsets := s.offsetCommits.Items() 121 | for _, offset := range offsets { 122 | val := offset.(OffsetCommit) 123 | 124 | // Initialize inner maps as necessary 125 | if _, exists := offsetsByGroup[val.Key.Group]; !exists { 126 | offsetsByGroup[val.Key.Group] = make(map[string]map[int32]OffsetCommit) 127 | } 128 | if _, exists := offsetsByGroup[val.Key.Group][val.Key.Topic]; !exists { 129 | offsetsByGroup[val.Key.Group][val.Key.Topic] = make(map[int32]OffsetCommit) 130 | } 131 | 132 | offsetsByGroup[val.Key.Group][val.Key.Topic][val.Key.Partition] = val 133 | } 134 | 135 | return offsetsByGroup 136 | } 137 | 138 | func (s *Storage) deleteOffsetCommit(key kmsg.OffsetCommitKey) { 139 | uniqueKey := encodeOffsetCommitKey(key) 140 | s.offsetCommits.Remove(uniqueKey) 141 | } 142 | 143 | func encodeOffsetCommitKey(key kmsg.OffsetCommitKey) string { 144 | return fmt.Sprintf("%v:%v:%v", key.Group, key.Topic, key.Partition) 145 | } 146 | -------------------------------------------------------------------------------- /minion/utils.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "strings" 7 | ) 8 | 9 | func (s *Service) IsGroupAllowed(groupName string) bool { 10 | isAllowed := false 11 | for _, regex := range s.AllowedGroupIDsExpr { 12 | if regex.MatchString(groupName) { 13 | isAllowed = true 14 | break 15 | } 16 | } 17 | 18 | for _, regex := range s.IgnoredGroupIDsExpr { 19 | if regex.MatchString(groupName) { 20 | isAllowed = false 21 | break 22 | } 23 | } 24 | return isAllowed 25 | } 26 | 27 | func (s *Service) IsTopicAllowed(topicName string) bool { 28 | isAllowed := false 29 | for _, regex := range s.AllowedTopicsExpr { 30 | if regex.MatchString(topicName) { 31 | isAllowed = true 32 | break 33 | } 34 | } 35 | 36 | for _, regex := range s.IgnoredTopicsExpr { 37 | if regex.MatchString(topicName) { 38 | 
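// An ignore match always wins over an earlier allow match, so reaching this
// branch removes the topic from scraping even if it matched an allowed
// expression. Illustrative outcomes under an assumed configuration of
// allowedTopics ["/orders-.*/"] and ignoredTopics ["/orders-internal-.*/"]
// (both patterns are hypothetical, chosen only for this example):
//
//	IsTopicAllowed("orders-eu")             // true:  allowed match, no ignore match
//	IsTopicAllowed("orders-internal-audit") // false: the ignore match revokes the allow
//	IsTopicAllowed("payments")              // false: no allowed expression matches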
isAllowed = false 39 | break 40 | } 41 | } 42 | return isAllowed 43 | } 44 | 45 | func compileRegex(expr string) (*regexp.Regexp, error) { 46 | if strings.HasPrefix(expr, "/") && strings.HasSuffix(expr, "/") { 47 | substr := expr[1 : len(expr)-1] 48 | regex, err := regexp.Compile(substr) 49 | if err != nil { 50 | return nil, err 51 | } 52 | 53 | return regex, nil 54 | } 55 | 56 | // If this is no regex input (which is marked by the slashes around it) then we escape it so that it's a literal 57 | regex, err := regexp.Compile("^" + expr + "$") 58 | if err != nil { 59 | return nil, err 60 | } 61 | return regex, nil 62 | } 63 | 64 | func compileRegexes(expr []string) ([]*regexp.Regexp, error) { 65 | compiledExpressions := make([]*regexp.Regexp, len(expr)) 66 | for i, exprStr := range expr { 67 | expr, err := compileRegex(exprStr) 68 | if err != nil { 69 | return nil, fmt.Errorf("failed to compile expression string '%v': %w", exprStr, err) 70 | } 71 | compiledExpressions[i] = expr 72 | } 73 | 74 | return compiledExpressions, nil 75 | } 76 | -------------------------------------------------------------------------------- /minion/versions.go: -------------------------------------------------------------------------------- 1 | package minion 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/twmb/franz-go/pkg/kerr" 8 | "github.com/twmb/franz-go/pkg/kmsg" 9 | "github.com/twmb/franz-go/pkg/kversion" 10 | ) 11 | 12 | func (s *Service) GetClusterVersion(ctx context.Context) (string, error) { 13 | res, err := s.GetAPIVersions(ctx) 14 | if err != nil { 15 | return "", err 16 | } 17 | 18 | versions := kversion.FromApiVersionsResponse(res) 19 | return versions.VersionGuess(), nil 20 | } 21 | 22 | func (s *Service) GetAPIVersions(ctx context.Context) (*kmsg.ApiVersionsResponse, error) { 23 | versionsReq := kmsg.NewApiVersionsRequest() 24 | versionsReq.ClientSoftwareName = "kminion" 25 | versionsReq.ClientSoftwareVersion = "v2" 26 | res, err := versionsReq.RequestWith(ctx, s.client) 27 | if err != nil { 28 | return nil, fmt.Errorf("failed to request api versions: %w", err) 29 | } 30 | 31 | err = kerr.ErrorForCode(res.ErrorCode) 32 | if err != nil { 33 | return nil, fmt.Errorf("failed to request api versions. 
Inner kafka error: %w", err) 34 | } 35 | 36 | return res, nil 37 | } 38 | -------------------------------------------------------------------------------- /prometheus/collect_broker_info.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "github.com/prometheus/client_golang/prometheus" 6 | "go.uber.org/zap" 7 | "strconv" 8 | ) 9 | 10 | func (e *Exporter) collectBrokerInfo(ctx context.Context, ch chan<- prometheus.Metric) bool { 11 | metadata, err := e.minionSvc.GetMetadataCached(ctx) 12 | if err != nil { 13 | e.logger.Error("failed to get kafka metadata", zap.Error(err)) 14 | return false 15 | } 16 | 17 | for _, broker := range metadata.Brokers { 18 | rack := "" 19 | if broker.Rack != nil { 20 | rack = *broker.Rack 21 | } 22 | 23 | isController := metadata.ControllerID == broker.NodeID 24 | ch <- prometheus.MustNewConstMetric( 25 | e.brokerInfo, 26 | prometheus.GaugeValue, 27 | 1, 28 | strconv.Itoa(int(broker.NodeID)), 29 | broker.Host, 30 | strconv.Itoa(int(broker.Port)), 31 | rack, 32 | strconv.FormatBool(isController), 33 | ) 34 | } 35 | 36 | return true 37 | } 38 | -------------------------------------------------------------------------------- /prometheus/collect_cluster_info.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "github.com/prometheus/client_golang/prometheus" 6 | "go.uber.org/zap" 7 | "strconv" 8 | ) 9 | 10 | func (e *Exporter) collectClusterInfo(ctx context.Context, ch chan<- prometheus.Metric) bool { 11 | version, err := e.minionSvc.GetClusterVersion(ctx) 12 | if err != nil { 13 | e.logger.Error("failed to get kafka cluster version", zap.Error(err)) 14 | return false 15 | } 16 | 17 | metadata, err := e.minionSvc.GetMetadataCached(ctx) 18 | if err != nil { 19 | e.logger.Error("failed to get kafka metadata", zap.Error(err)) 20 | return false 21 | } 22 | brokerCount := len(metadata.Brokers) 23 | clusterID := "" 24 | if metadata.ClusterID != nil { 25 | clusterID = *metadata.ClusterID 26 | } 27 | 28 | ch <- prometheus.MustNewConstMetric( 29 | e.clusterInfo, 30 | prometheus.GaugeValue, 31 | 1, 32 | version, 33 | strconv.Itoa(brokerCount), 34 | strconv.Itoa(int(metadata.ControllerID)), 35 | clusterID, 36 | ) 37 | return true 38 | } 39 | -------------------------------------------------------------------------------- /prometheus/collect_consumer_group_lags.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "math" 6 | "strconv" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/twmb/franz-go/pkg/kadm" 10 | "github.com/twmb/franz-go/pkg/kerr" 11 | "go.uber.org/zap" 12 | 13 | "github.com/cloudhut/kminion/v2/minion" 14 | ) 15 | 16 | type waterMark struct { 17 | TopicName string 18 | PartitionID int32 19 | LowWaterMark int64 20 | HighWaterMark int64 21 | } 22 | 23 | func (e *Exporter) collectConsumerGroupLags(ctx context.Context, ch chan<- prometheus.Metric) bool { 24 | if !e.minionSvc.Cfg.ConsumerGroups.Enabled { 25 | return true 26 | } 27 | 28 | // Low Watermarks (at the moment they are not needed at all, they could be used to calculate the lag on partitions 29 | // that don't have any active offsets) 30 | lowWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -2) 31 | if err != nil { 32 | e.logger.Error("failed to fetch low water marks", zap.Error(err)) 33 | return false 34 | } 35 | 
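// The two ListOffsetsCached calls in this function use Kafka's special
// timestamps: -2 requests the earliest offsets (low watermarks) and -1 the
// latest offsets (high watermarks). Both calls are served from the minion
// service's request-scoped cache (singleflight plus a TTL), which is keyed by
// the "requestId" value carried on the context. A minimal sketch of how a
// caller could attach such a value follows; the literal id is hypothetical
// and only illustrates the convention.
scrapeCtx := context.WithValue(ctx, "requestId", "scrape-0001") // hypothetical per-scrape id
_ = scrapeCtx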
// High Watermarks 36 | highWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -1) 37 | if err != nil { 38 | e.logger.Error("failed to fetch low water marks", zap.Error(err)) 39 | return false 40 | } 41 | waterMarksByTopic := e.waterMarksByTopic(lowWaterMarks, highWaterMarks) 42 | 43 | // We have two different options to get consumer group offsets - either via the AdminAPI or by consuming the 44 | // __consumer_offsets topic. 45 | if e.minionSvc.Cfg.ConsumerGroups.ScrapeMode == minion.ConsumerGroupScrapeModeAdminAPI { 46 | return e.collectConsumerGroupLagsAdminAPI(ctx, ch, waterMarksByTopic) 47 | } else { 48 | return e.collectConsumerGroupLagsOffsetTopic(ctx, ch, waterMarksByTopic) 49 | } 50 | } 51 | 52 | func (e *Exporter) collectConsumerGroupLagsOffsetTopic(_ context.Context, ch chan<- prometheus.Metric, marks map[string]map[int32]waterMark) bool { 53 | offsets := e.minionSvc.ListAllConsumerGroupOffsetsInternal() 54 | for groupName, group := range offsets { 55 | if !e.minionSvc.IsGroupAllowed(groupName) { 56 | continue 57 | } 58 | offsetCommits := 0 59 | 60 | for topicName, topic := range group { 61 | topicLag := float64(0) 62 | topicOffsetSum := float64(0) 63 | for partitionID, partition := range topic { 64 | childLogger := e.logger.With( 65 | zap.String("consumer_group", groupName), 66 | zap.String("topic_name", topicName), 67 | zap.Int32("partition_id", partitionID), 68 | zap.Int64("group_offset", partition.Value.Offset)) 69 | 70 | topicMark, exists := marks[topicName] 71 | if !exists { 72 | childLogger.Warn("consumer group has committed offsets on a topic we don't have watermarks for") 73 | break // We can stop trying to find any other offsets for that topic so let's quit this loop 74 | } 75 | partitionMark, exists := topicMark[partitionID] 76 | if !exists { 77 | childLogger.Warn("consumer group has committed offsets on a partition we don't have watermarks for") 78 | continue 79 | } 80 | lag := float64(partitionMark.HighWaterMark - partition.Value.Offset) 81 | // Lag might be negative because we fetch group offsets after we get partition offsets. It's kinda a 82 | // race condition. Negative lags obviously do not make sense so use at least 0 as lag. 
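// Worked example with hypothetical numbers: a partition whose high watermark
// is 1000 and whose committed group offset is 970 reports a lag of 30. If the
// group offset were instead read as 1005 because a commit raced the watermark
// fetch, the clamp below would report 0 rather than -5.
exampleLag := math.Max(0, float64(1000-970))      // 30
exampleRacyLag := math.Max(0, float64(1000-1005)) // clamped to 0
_, _ = exampleLag, exampleRacyLag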
83 | lag = math.Max(0, lag) 84 | topicLag += lag 85 | topicOffsetSum += float64(partition.Value.Offset) 86 | 87 | // Offset commit count for this consumer group 88 | offsetCommits += partition.CommitCount 89 | 90 | if e.minionSvc.Cfg.ConsumerGroups.Granularity == minion.ConsumerGroupGranularityTopic { 91 | continue 92 | } 93 | ch <- prometheus.MustNewConstMetric( 94 | e.consumerGroupTopicPartitionLag, 95 | prometheus.GaugeValue, 96 | lag, 97 | groupName, 98 | topicName, 99 | strconv.Itoa(int(partitionID)), 100 | ) 101 | } 102 | ch <- prometheus.MustNewConstMetric( 103 | e.consumerGroupTopicLag, 104 | prometheus.GaugeValue, 105 | topicLag, 106 | groupName, 107 | topicName, 108 | ) 109 | ch <- prometheus.MustNewConstMetric( 110 | e.consumerGroupTopicOffsetSum, 111 | prometheus.GaugeValue, 112 | topicOffsetSum, 113 | groupName, 114 | topicName, 115 | ) 116 | } 117 | 118 | ch <- prometheus.MustNewConstMetric( 119 | e.offsetCommits, 120 | prometheus.CounterValue, 121 | float64(offsetCommits), 122 | groupName, 123 | ) 124 | } 125 | return true 126 | } 127 | 128 | func (e *Exporter) collectConsumerGroupLagsAdminAPI(ctx context.Context, ch chan<- prometheus.Metric, marks map[string]map[int32]waterMark) bool { 129 | isOk := true 130 | 131 | groupOffsets, err := e.minionSvc.ListAllConsumerGroupOffsetsAdminAPI(ctx) 132 | for groupName, offsetRes := range groupOffsets { 133 | if !e.minionSvc.IsGroupAllowed(groupName) { 134 | continue 135 | } 136 | 137 | err = kerr.ErrorForCode(offsetRes.ErrorCode) 138 | if err != nil { 139 | e.logger.Warn("failed to get offsets from consumer group, inner kafka error", 140 | zap.String("consumer_group", groupName), 141 | zap.Error(err)) 142 | isOk = false 143 | continue 144 | } 145 | for _, topic := range offsetRes.Topics { 146 | topicLag := float64(0) 147 | topicOffsetSum := float64(0) 148 | for _, partition := range topic.Partitions { 149 | err := kerr.ErrorForCode(partition.ErrorCode) 150 | if err != nil { 151 | e.logger.Warn("failed to get consumer group offsets for a partition, inner kafka error", 152 | zap.String("consumer_group", groupName), 153 | zap.Error(err)) 154 | isOk = false 155 | continue 156 | } 157 | 158 | childLogger := e.logger.With( 159 | zap.String("consumer_group", groupName), 160 | zap.String("topic_name", topic.Topic), 161 | zap.Int32("partition_id", partition.Partition), 162 | zap.Int64("group_offset", partition.Offset)) 163 | topicMark, exists := marks[topic.Topic] 164 | if !exists { 165 | childLogger.Warn("consumer group has committed offsets on a topic we don't have watermarks for") 166 | isOk = false 167 | break // We can stop trying to find any other offsets for that topic so let's quit this loop 168 | } 169 | partitionMark, exists := topicMark[partition.Partition] 170 | if !exists { 171 | childLogger.Warn("consumer group has committed offsets on a partition we don't have watermarks for") 172 | isOk = false 173 | continue 174 | } 175 | lag := float64(partitionMark.HighWaterMark - partition.Offset) 176 | // Lag might be negative because we fetch group offsets after we get partition offsets. It's kinda a 177 | // race condition. Negative lags obviously do not make sense so use at least 0 as lag. 
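// A few lines above, the two-level existence checks distinguish a topic that
// is missing from the watermark map entirely (break: give up on that topic)
// from a single missing partition (continue: skip only that partition). A
// hedged helper capturing that lookup could look like the sketch below
// (hypothetical function, not part of the exporter):
lookupWaterMark := func(allMarks map[string]map[int32]waterMark, topic string, partition int32) (waterMark, bool) {
	byPartition, exists := allMarks[topic]
	if !exists {
		return waterMark{}, false
	}
	mark, exists := byPartition[partition]
	return mark, exists
}
_ = lookupWaterMark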
178 | lag = math.Max(0, lag) 179 | topicLag += lag 180 | topicOffsetSum += float64(partition.Offset) 181 | 182 | if e.minionSvc.Cfg.ConsumerGroups.Granularity == minion.ConsumerGroupGranularityTopic { 183 | continue 184 | } 185 | ch <- prometheus.MustNewConstMetric( 186 | e.consumerGroupTopicPartitionLag, 187 | prometheus.GaugeValue, 188 | lag, 189 | groupName, 190 | topic.Topic, 191 | strconv.Itoa(int(partition.Partition)), 192 | ) 193 | } 194 | 195 | ch <- prometheus.MustNewConstMetric( 196 | e.consumerGroupTopicLag, 197 | prometheus.GaugeValue, 198 | topicLag, 199 | groupName, 200 | topic.Topic, 201 | ) 202 | ch <- prometheus.MustNewConstMetric( 203 | e.consumerGroupTopicOffsetSum, 204 | prometheus.GaugeValue, 205 | topicOffsetSum, 206 | groupName, 207 | topic.Topic, 208 | ) 209 | } 210 | } 211 | return isOk 212 | } 213 | 214 | func (e *Exporter) waterMarksByTopic(lowMarks kadm.ListedOffsets, highMarks kadm.ListedOffsets) map[string]map[int32]waterMark { 215 | type partitionID = int32 216 | type topicName = string 217 | waterMarks := make(map[topicName]map[partitionID]waterMark) 218 | 219 | for topic, lowMarksByPartitionID := range lowMarks { 220 | _, exists := waterMarks[topic] 221 | if !exists { 222 | waterMarks[topic] = make(map[partitionID]waterMark) 223 | } 224 | 225 | for _, lowOffset := range lowMarksByPartitionID { 226 | if lowOffset.Err != nil { 227 | e.logger.Debug("failed to get partition low water mark, inner kafka error", 228 | zap.String("topic_name", lowOffset.Topic), 229 | zap.Int32("partition_id", lowOffset.Partition), 230 | zap.Error(lowOffset.Err)) 231 | continue 232 | } 233 | 234 | higOffset, exists := highMarks.Lookup(lowOffset.Topic, lowOffset.Partition) 235 | if !exists { 236 | e.logger.Error("got low water marks for a topic's partition but no high watermarks", 237 | zap.String("topic_name", lowOffset.Topic), 238 | zap.Int32("partition_id", lowOffset.Partition), 239 | zap.Int64("offset", lowOffset.Offset)) 240 | delete(waterMarks, lowOffset.Topic) 241 | break // Topic watermarks are invalid -> delete & skip this topic 242 | } 243 | if higOffset.Err != nil { 244 | e.logger.Debug("failed to get partition low water mark, inner kafka error", 245 | zap.String("topic_name", lowOffset.Topic), 246 | zap.Int32("partition_id", lowOffset.Partition), 247 | zap.Error(lowOffset.Err)) 248 | continue 249 | } 250 | 251 | waterMarks[lowOffset.Topic][lowOffset.Partition] = waterMark{ 252 | TopicName: lowOffset.Topic, 253 | PartitionID: lowOffset.Partition, 254 | LowWaterMark: lowOffset.Offset, 255 | HighWaterMark: higOffset.Offset, 256 | } 257 | } 258 | } 259 | 260 | return waterMarks 261 | } 262 | -------------------------------------------------------------------------------- /prometheus/collect_consumer_groups.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strconv" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/twmb/franz-go/pkg/kerr" 10 | "github.com/twmb/franz-go/pkg/kmsg" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | func (e *Exporter) collectConsumerGroups(ctx context.Context, ch chan<- prometheus.Metric) bool { 15 | if !e.minionSvc.Cfg.ConsumerGroups.Enabled { 16 | return true 17 | } 18 | groups, err := e.minionSvc.DescribeConsumerGroups(ctx) 19 | if err != nil { 20 | e.logger.Error("failed to collect consumer groups, because Kafka request failed", zap.Error(err)) 21 | return false 22 | } 23 | 24 | // The list of groups may be incomplete due to group 
coordinators that might fail to respond. We do log an error 25 | // message in that case (in the kafka request method) and groups will not be included in this list. 26 | for _, grp := range groups { 27 | coordinator := grp.BrokerMetadata.NodeID 28 | for _, group := range grp.Groups.Groups { 29 | err := kerr.ErrorForCode(group.ErrorCode) 30 | if err != nil { 31 | e.logger.Warn("failed to describe consumer group, internal kafka error", 32 | zap.Error(err), 33 | zap.String("group_id", group.Group), 34 | ) 35 | continue 36 | } 37 | if !e.minionSvc.IsGroupAllowed(group.Group) { 38 | continue 39 | } 40 | state := 0 41 | if group.State == "Stable" { 42 | state = 1 43 | } 44 | ch <- prometheus.MustNewConstMetric( 45 | e.consumerGroupInfo, 46 | prometheus.GaugeValue, 47 | float64(state), 48 | group.Group, 49 | group.Protocol, 50 | group.ProtocolType, 51 | group.State, 52 | strconv.FormatInt(int64(coordinator), 10), 53 | ) 54 | 55 | // total number of members in consumer groups 56 | ch <- prometheus.MustNewConstMetric( 57 | e.consumerGroupMembers, 58 | prometheus.GaugeValue, 59 | float64(len(group.Members)), 60 | group.Group, 61 | ) 62 | 63 | // iterate all members and build two maps: 64 | // - {topic -> number-of-consumers} 65 | // - {topic -> number-of-partitions-assigned} 66 | topicConsumers := make(map[string]int) 67 | topicPartitionsAssigned := make(map[string]int) 68 | membersWithEmptyAssignment := 0 69 | failedAssignmentsDecode := 0 70 | for _, member := range group.Members { 71 | if len(member.MemberAssignment) == 0 { 72 | membersWithEmptyAssignment++ 73 | continue 74 | } 75 | 76 | kassignment, err := decodeMemberAssignments(group.ProtocolType, member) 77 | if err != nil { 78 | e.logger.Debug("failed to decode consumer group member assignment, internal kafka error", 79 | zap.Error(err), 80 | zap.String("group_id", group.Group), 81 | zap.String("client_id", member.ClientID), 82 | zap.String("member_id", member.MemberID), 83 | zap.String("client_host", member.ClientHost), 84 | ) 85 | failedAssignmentsDecode++ 86 | continue 87 | } 88 | if kassignment == nil { 89 | // This is expected in the case of protocolTypes that don't provide valuable information 90 | continue 91 | } 92 | 93 | if len(kassignment.Topics) == 0 { 94 | membersWithEmptyAssignment++ 95 | } 96 | for _, topic := range kassignment.Topics { 97 | topicConsumers[topic.Topic]++ 98 | topicPartitionsAssigned[topic.Topic] += len(topic.Partitions) 99 | } 100 | } 101 | 102 | if failedAssignmentsDecode > 0 { 103 | e.logger.Error("failed to decode consumer group member assignment, internal kafka error", 104 | zap.Error(err), 105 | zap.String("group_id", group.Group), 106 | zap.Int("assignment_decode_failures", failedAssignmentsDecode), 107 | ) 108 | } 109 | 110 | // number of members with no assignment in a stable consumer group 111 | if membersWithEmptyAssignment > 0 { 112 | ch <- prometheus.MustNewConstMetric( 113 | e.consumerGroupMembersEmpty, 114 | prometheus.GaugeValue, 115 | float64(membersWithEmptyAssignment), 116 | group.Group, 117 | ) 118 | } 119 | // number of members in consumer groups for each topic 120 | for topicName, consumers := range topicConsumers { 121 | ch <- prometheus.MustNewConstMetric( 122 | e.consumerGroupTopicMembers, 123 | prometheus.GaugeValue, 124 | float64(consumers), 125 | group.Group, 126 | topicName, 127 | ) 128 | } 129 | // number of partitions assigned in consumer groups for each topic 130 | for topicName, partitions := range topicPartitionsAssigned { 131 | ch <- prometheus.MustNewConstMetric( 132 | 
e.consumerGroupAssignedTopicPartitions, 133 | prometheus.GaugeValue, 134 | float64(partitions), 135 | group.Group, 136 | topicName, 137 | ) 138 | } 139 | } 140 | } 141 | return true 142 | } 143 | 144 | func decodeMemberAssignments(protocolType string, member kmsg.DescribeGroupsResponseGroupMember) (*kmsg.ConsumerMemberAssignment, error) { 145 | switch protocolType { 146 | case "consumer": 147 | a := kmsg.NewConsumerMemberAssignment() 148 | if err := a.ReadFrom(member.MemberAssignment); err != nil { 149 | return nil, fmt.Errorf("failed to decode member assignment: %w", err) 150 | } 151 | return &a, nil 152 | case "connect": 153 | return nil, nil 154 | default: 155 | return nil, nil 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /prometheus/collect_exporter_metrics.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "github.com/prometheus/client_golang/prometheus" 6 | ) 7 | 8 | func (e *Exporter) collectExporterMetrics(_ context.Context, ch chan<- prometheus.Metric) bool { 9 | recordsConsumed := e.minionSvc.GetNumberOfOffsetRecordsConsumed() 10 | ch <- prometheus.MustNewConstMetric( 11 | e.offsetConsumerRecordsConsumed, 12 | prometheus.CounterValue, 13 | recordsConsumed, 14 | ) 15 | return true 16 | } 17 | -------------------------------------------------------------------------------- /prometheus/collect_log_dirs.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "github.com/prometheus/client_golang/prometheus" 6 | "github.com/twmb/franz-go/pkg/kerr" 7 | "github.com/twmb/franz-go/pkg/kgo" 8 | "go.uber.org/zap" 9 | "strconv" 10 | ) 11 | 12 | func (e *Exporter) collectLogDirs(ctx context.Context, ch chan<- prometheus.Metric) bool { 13 | if !e.minionSvc.Cfg.LogDirs.Enabled { 14 | return true 15 | } 16 | isOk := true 17 | 18 | sizeByBroker := make(map[kgo.BrokerMetadata]int64) 19 | sizeByTopicName := make(map[string]int64) 20 | 21 | logDirsSharded := e.minionSvc.DescribeLogDirs(ctx) 22 | for _, logDirRes := range logDirsSharded { 23 | childLogger := e.logger.With(zap.String("broker_address", logDirRes.Broker.Host), 24 | zap.String("broker_id", strconv.Itoa(int(logDirRes.Broker.NodeID)))) 25 | 26 | if logDirRes.Err != nil { 27 | childLogger.Error("failed to describe a broker's log dirs", zap.Error(logDirRes.Err)) 28 | isOk = false 29 | continue 30 | } 31 | 32 | for _, dir := range logDirRes.LogDirs.Dirs { 33 | err := kerr.ErrorForCode(dir.ErrorCode) 34 | if err != nil { 35 | childLogger.Error("failed to describe a broker's log dir", 36 | zap.String("log_dir", dir.Dir), 37 | zap.Error(err)) 38 | isOk = false 39 | continue 40 | } 41 | for _, topic := range dir.Topics { 42 | topicSize := int64(0) 43 | for _, partition := range topic.Partitions { 44 | topicSize += partition.Size 45 | } 46 | sizeByTopicName[topic.Topic] += topicSize 47 | sizeByBroker[logDirRes.Broker] += topicSize 48 | } 49 | } 50 | } 51 | 52 | // Report the total log dir size per broker 53 | for broker, size := range sizeByBroker { 54 | rackID := "" 55 | if broker.Rack != nil { 56 | rackID = *broker.Rack 57 | } 58 | ch <- prometheus.MustNewConstMetric( 59 | e.brokerLogDirSize, 60 | prometheus.GaugeValue, 61 | float64(size), 62 | strconv.Itoa(int(broker.NodeID)), 63 | broker.Host, 64 | strconv.Itoa(int(broker.Port)), 65 | rackID, 66 | ) 67 | } 68 | 69 | // If one of the log dir responses returned an 
error we can not reliably report the topic log dirs, as there might 70 | // be additional data on the brokers that failed to respond. 71 | if !isOk { 72 | return false 73 | } 74 | 75 | // Report the total log dir size per topic 76 | for topicName, size := range sizeByTopicName { 77 | ch <- prometheus.MustNewConstMetric( 78 | e.topicLogDirSize, 79 | prometheus.GaugeValue, 80 | float64(size), 81 | topicName, 82 | ) 83 | } 84 | 85 | return isOk 86 | } 87 | -------------------------------------------------------------------------------- /prometheus/collect_topic_info.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "strconv" 6 | 7 | "github.com/prometheus/client_golang/prometheus" 8 | "github.com/twmb/franz-go/pkg/kerr" 9 | "go.uber.org/zap" 10 | ) 11 | 12 | func (e *Exporter) collectTopicInfo(ctx context.Context, ch chan<- prometheus.Metric) bool { 13 | if !e.minionSvc.Cfg.Topics.Enabled { 14 | return true 15 | } 16 | 17 | metadata, err := e.minionSvc.GetMetadataCached(ctx) 18 | if err != nil { 19 | e.logger.Error("failed to get metadata", zap.Error(err)) 20 | return false 21 | } 22 | 23 | topicConfigs, err := e.minionSvc.GetTopicConfigs(ctx) 24 | if err != nil { 25 | e.logger.Error("failed to get topic configs", zap.Error(err)) 26 | return false 27 | } 28 | 29 | isOk := true 30 | // ConfigsByTopic is indexed by topic name and config resource name (inner key) 31 | configsByTopic := make(map[string]map[string]string) 32 | for _, resource := range topicConfigs.Resources { 33 | configsByTopic[resource.ResourceName] = make(map[string]string) 34 | typedErr := kerr.TypedErrorForCode(resource.ErrorCode) 35 | if typedErr != nil { 36 | isOk = false 37 | e.logger.Warn("failed to get topic config of a specific topic", 38 | zap.String("topic_name", resource.ResourceName), 39 | zap.Error(typedErr)) 40 | continue 41 | } 42 | 43 | for _, config := range resource.Configs { 44 | confVal := "nil" 45 | if config.Value != nil { 46 | confVal = *config.Value 47 | } 48 | configsByTopic[resource.ResourceName][config.Name] = confVal 49 | } 50 | 51 | } 52 | 53 | for _, topic := range metadata.Topics { 54 | topicName := *topic.Topic 55 | if !e.minionSvc.IsTopicAllowed(topicName) { 56 | continue 57 | } 58 | typedErr := kerr.TypedErrorForCode(topic.ErrorCode) 59 | if typedErr != nil { 60 | isOk = false 61 | e.logger.Warn("failed to get metadata of a specific topic", 62 | zap.String("topic_name", topicName), 63 | zap.Error(typedErr)) 64 | continue 65 | } 66 | partitionCount := len(topic.Partitions) 67 | replicationFactor := -1 68 | if partitionCount > 0 { 69 | // It should never be possible to skip this, but just to be safe we'll check this so that we don't cause panics 70 | replicationFactor = len(topic.Partitions[0].Replicas) 71 | } 72 | 73 | var labelsValues []string 74 | labelsValues = append(labelsValues, topicName) 75 | labelsValues = append(labelsValues, strconv.Itoa(partitionCount)) 76 | labelsValues = append(labelsValues, strconv.Itoa(replicationFactor)) 77 | for _, key := range e.minionSvc.Cfg.Topics.InfoMetric.ConfigKeys { 78 | labelsValues = append(labelsValues, getOrDefault(configsByTopic[topicName], key, "N/A")) 79 | } 80 | ch <- prometheus.MustNewConstMetric( 81 | e.topicInfo, 82 | prometheus.GaugeValue, 83 | float64(1), 84 | labelsValues..., 85 | ) 86 | } 87 | return isOk 88 | } 89 | 90 | func getOrDefault(m map[string]string, key string, defaultValue string) string { 91 | if value, exists := m[key]; exists { 92 | 
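// Usage sketch with hypothetical values: collectTopicInfo above relies on this
// helper to keep the topic info label set complete when a configured config
// key is missing for a topic, for example because describing that topic's
// config failed.
//
//	getOrDefault(map[string]string{"cleanup.policy": "delete"}, "cleanup.policy", "N/A") // "delete"
//	getOrDefault(map[string]string{}, "retention.ms", "N/A")                             // "N/A"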
return value 93 | } 94 | return defaultValue 95 | } 96 | -------------------------------------------------------------------------------- /prometheus/collect_topic_partition_offsets.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "context" 5 | "strconv" 6 | 7 | "github.com/prometheus/client_golang/prometheus" 8 | "go.uber.org/zap" 9 | 10 | "github.com/cloudhut/kminion/v2/minion" 11 | ) 12 | 13 | func (e *Exporter) collectTopicPartitionOffsets(ctx context.Context, ch chan<- prometheus.Metric) bool { 14 | if !e.minionSvc.Cfg.Topics.Enabled { 15 | return true 16 | } 17 | 18 | isOk := true 19 | 20 | // Low Watermarks 21 | lowWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -2) 22 | if err != nil { 23 | e.logger.Error("failed to fetch low water marks", zap.Error(err)) 24 | return false 25 | } 26 | // High Watermarks 27 | highWaterMarks, err := e.minionSvc.ListOffsetsCached(ctx, -1) 28 | if err != nil { 29 | e.logger.Error("failed to fetch low water marks", zap.Error(err)) 30 | return false 31 | } 32 | 33 | // Process Low Watermarks 34 | 35 | for topicName, partitions := range lowWaterMarks { 36 | if !e.minionSvc.IsTopicAllowed(topicName) { 37 | continue 38 | } 39 | 40 | waterMarkSum := int64(0) 41 | hasErrors := false 42 | for _, offset := range partitions { 43 | if offset.Err != nil { 44 | hasErrors = true 45 | isOk = false 46 | continue 47 | } 48 | waterMarkSum += offset.Offset 49 | // Let's end here if partition metrics shall not be exposed 50 | if e.minionSvc.Cfg.Topics.Granularity == minion.TopicGranularityTopic { 51 | continue 52 | } 53 | ch <- prometheus.MustNewConstMetric( 54 | e.partitionLowWaterMark, 55 | prometheus.GaugeValue, 56 | float64(offset.Offset), 57 | topicName, 58 | strconv.Itoa(int(offset.Partition)), 59 | ) 60 | } 61 | // We only want to report the sum of all partition marks if we receive watermarks from all partition 62 | if !hasErrors { 63 | ch <- prometheus.MustNewConstMetric( 64 | e.topicLowWaterMarkSum, 65 | prometheus.GaugeValue, 66 | float64(waterMarkSum), 67 | topicName, 68 | ) 69 | } 70 | } 71 | 72 | for topicName, partitions := range highWaterMarks { 73 | if !e.minionSvc.IsTopicAllowed(topicName) { 74 | continue 75 | } 76 | waterMarkSum := int64(0) 77 | hasErrors := false 78 | for _, offset := range partitions { 79 | if offset.Err != nil { 80 | hasErrors = true 81 | isOk = false 82 | continue 83 | } 84 | waterMarkSum += offset.Offset 85 | // Let's end here if partition metrics shall not be exposed 86 | if e.minionSvc.Cfg.Topics.Granularity == minion.TopicGranularityTopic { 87 | continue 88 | } 89 | ch <- prometheus.MustNewConstMetric( 90 | e.partitionHighWaterMark, 91 | prometheus.GaugeValue, 92 | float64(offset.Offset), 93 | topicName, 94 | strconv.Itoa(int(offset.Partition)), 95 | ) 96 | } 97 | // We only want to report the sum of all partition marks if we receive watermarks from all partitions 98 | if !hasErrors { 99 | ch <- prometheus.MustNewConstMetric( 100 | e.topicHighWaterMarkSum, 101 | prometheus.GaugeValue, 102 | float64(waterMarkSum), 103 | topicName, 104 | ) 105 | } 106 | } 107 | 108 | return isOk 109 | } 110 | -------------------------------------------------------------------------------- /prometheus/config.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | type Config struct { 4 | Host string `koanf:"host"` 5 | Port int `koanf:"port"` 6 | Namespace string `koanf:"namespace"` 7 | } 8 | 9 | func (c 
*Config) SetDefaults() { 10 | c.Port = 8080 11 | c.Namespace = "kminion" 12 | } 13 | --------------------------------------------------------------------------------
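The following is a minimal, illustrative sketch of how the exporter defaults above could feed an HTTP listen address. The struct and wiring are simplified stand-ins written only for this example (the field names mirror prometheus/config.go, everything else is assumed) and are not KMinion's actual startup code.

package main

import (
	"fmt"
	"net"
	"strconv"
)

// exporterConfig mirrors the Host/Port/Namespace fields of the prometheus
// package's Config struct shown above.
type exporterConfig struct {
	Host      string
	Port      int
	Namespace string
}

// setDefaults applies the same defaults as Config.SetDefaults above.
func (c *exporterConfig) setDefaults() {
	c.Port = 8080
	c.Namespace = "kminion"
}

func main() {
	var cfg exporterConfig
	cfg.setDefaults()

	// An empty host means "listen on all interfaces"; JoinHostPort also keeps
	// IPv6 literals valid.
	addr := net.JoinHostPort(cfg.Host, strconv.Itoa(cfg.Port))
	fmt.Printf("metrics would be served on %q under namespace %q\n", addr, cfg.Namespace)
}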