├── .gitignore
├── 000-template.md
├── 001-move-strimzi-kafka-operators-to-java-11.md
├── 002-documentation-improvements.md
├── 003-remove-deprecated-topic-operator-from-kafka-crd.md
├── 004-github-repository-restructuring.md
├── 005-improving-configurability-of-kafka-listeners.md
├── 006-design-docs.md
├── 007-restarting-kafka-connect-connectors-and-tasks.md
├── 008-tls-encrypt-the-kafka-connect-rest-api.md
├── 009-crd-v1-roadmap.md
├── 010-ui-and-admin-server-security.md
├── 011-strimzi-ui.md
├── 012-admin-server.md
├── 013-kafka-canary.md
├── 014-move-docker-images-to-quay.io.md
├── 015-kafka-connect-build.md
├── 016-modularizing-strimzi-ui.md
├── 017-kafka-topic-encryption.md
├── 018-rest-admin-api.md
├── 019-restruture-the-installation-files.md
├── 020-rename-default-branch-of-strimzi-github-repositories.md
├── 021-special-repository-for-st-clients-based-on-example-clients.md
├── 022-feature-gates.md
├── 023-using-ubi8-as-base-image.md
├── 024-adopt-the-kafka-static-quota-plugin.md
├── 025-control-plain-listener.md
├── 026-service-account-patching.md
├── 027-kubernetes-config-provider.md
├── 028-network-policy-generation-environment-variable.md
├── 029-adopt-the-drain-cleaner-utility.md
├── 030-env-var-config-provider.md
├── 031-statefulset-removal.md
├── 032-custom_authentication_in_kafka_brokers.md
├── 033-service-binding.md
├── 034-deprecate-and-remove-mirror-maker-2-extensions.md
├── 035-rebalance-types-scaling-brokers.md
├── 036-kraft-mode.md
├── 037-pluggable-pod-security-profiles.md
├── 038-optimization-proposal-autoapproval.md
├── 039-reduce-test-clients-images.md
├── 040-refactor-client-examples.md
├── 041-user-operator-configurable-exclusion-of-labels.md
├── 042-remove-bridge-amqp-support.md
├── 043-deprecate-and-remove-jmxtrans.md
├── 044-StrimziPodSets-graduation.md
├── 045-Stable-identities-for-Kafka-Connect-worker-nodes.md
├── 046-kraft-liveness-readiness.md
├── 047-cluster-wide-volume-usage-quota-management.md
├── 048-avoid-broker-restarts-when-in-recovery.md
├── 049-prevent-broker-scale-down-if-it-contains-partition-replicas.md
├── 050-Kafka-Node-Pools.md
├── 051-unidirectional-topic-operator.md
├── 052-k8s-server-side-apply.md
├── 053-record-reconciled-versions.md
├── 054-stopping-kafka-connect-connectors.md
├── 055-infinite-auto-restart-of-Kafka-connectors.md
├── 056-cruise-control-api-users.md
├── 057-run-zk-kraft-clusters-parallel.md
├── 058-deprecate-and-remove-envvar-config-provider.md
├── 059-zk-kraft-migration.md
├── 060-kafka-roller-kraft.md
├── 061-kraft-upgrades-and-downgrades.md
├── 062-UseKRaft-feature-gate-promotion.md
├── 063-pdb-generation-environment-variable.md
├── 064-prometheus-metrics-reporter.md
├── 065-support-tiered-storage.md
├── 066-topic-replication-factor-change.md
├── 067-kraft-jbod-support.md
├── 068-quotas-management.md
├── 069-performance-testing.md
├── 070-dont-fail-reconciliation-in-manual-rolling-update.md
├── 071-deprecate-bridge-openapi-2.md
├── 072-kafkabrige-consumer-producer.md
├── 073-improve-handling-of-CA-renewals-and-replacements-in-client-based-operands.md
├── 074-extend-feature-gates-to-all-operators.md
├── 075-additional-volumes-support.md
├── 076-connector-offsets-support.md
├── 077-support-for-kafka-4.0.md
├── 078-auto-rebalancing-cluster-scaling.md
├── 079-removal-of-mirror-maker-1.md
├── 080-deprecation-and-removal-of-storage-overrides.md
├── 081-unregistration-of-KRaft-nodes.md
├── 082-moving-data-between-two-jbod-disks-using-cruise-control.md
├── 083-mm2-connector-offsets-support.md
├── 084-templating-host-and-advertisedHost-fields.md
├── 085-configure-env-vars-based-on-secrets-or-configmaps.md
├── 086-archive-canary.md
├── 087-monitoring-of-custom-resources.md
├── 088-support-mounting-of-CSI-volumes.md
├── 089-adopt-connect-health-endpoint.md
├── 090-support-dns-config.md
├── 091-add-connect-to-test-container.md
├── 092-integrate-bridge-with-metrics-reporter.md
├── 093-single-step-multi-version-downgrade.md
├── 094-deprecate-secrets-field-in-custom-server-authentication.md
├── 095-add-support-volumeattributesclassname.md
├── 096-split-metrics-reporter-into-modules.md
├── 097-deprecate-OPA-authorization.md
├── 098-rebalance-progress-status.md
├── 099-drop-travis-ci-and-testing-for-ppc-and-s390x.md
├── 100-external-certificate-manager.md
├── 101-redesign-restart-events.md
├── 102-using-image-volumes-to-improve-extensibility-of-Strimzi-operands.md
├── 103-bridge-vertx5-errors.md
├── 104-bridge-vertx5-openapi-3.1.0.md
├── CODE_OF_CONDUCT.md
├── GOVERNANCE.md
├── LICENSE
├── MAINTAINERS.md
├── README.md
├── images
│   ├── 009-authentication-flow.svg
│   ├── 009-scram-sha512-admin-server.png
│   ├── 009-scram-sha512-ui-request-flow.png
│   ├── 011-deployment.png
│   ├── 011-http-session-expiry.svg
│   ├── 011-http-valid-session.svg
│   ├── 011-session-architecture.png
│   ├── 011-topicsdesign.png
│   ├── 011-topology.png
│   ├── 011-ws-handshake-session-expiry.svg
│   ├── 011-ws-message-session-expiry.svg
│   ├── 011-ws-valid-session.svg
│   ├── 017-kafkaenc-overview.png
│   ├── 031-strimzi-with-statefulset.png
│   ├── 031-strimzi-with-strimzipodset.png
│   ├── 031-strimzipodset-controller.png
│   ├── 047-quota-plugin-interactions.excalidraw
│   ├── 047-quota-plugin-interactions.png
│   ├── 048-kafka-roller-current-flow.png
│   ├── 048-kafka-roller-new-flow.png
│   ├── 051-states.png
│   ├── 059-rejected-zk-kraft-migration-fsm-1.png
│   ├── 059-rejected-zk-kraft-migration-fsm-2.png
│   ├── 059-zk-kraft-migration-fsm.png
│   ├── 064-current.png
│   ├── 064-proposal.png
│   ├── 100-cert-renewals.png
│   ├── 100-existing-renew-replace-clientca-certs.png
│   ├── 100-new-cluster-ca-key-replacement.png
│   ├── 100-new-ee-certs.png
│   └── 100-new-renew-replace-clientsca-cert.md
└── logo
    ├── cncf-color.png
    └── strimzi.png
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 |
--------------------------------------------------------------------------------
/000-template.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | #
4 |
5 | Provide a brief summary of the feature you are proposing to add to Strimzi.
6 |
7 | ## Current situation
8 |
9 | Describe the current capability Strimzi has in this area.
10 |
11 | ## Motivation
12 |
13 | Explain the motivation why this should be added, and what value it brings.
14 |
15 | ## Proposal
16 |
17 | Provide an introduction to the proposal. Use subsections to call out considerations, possible delivery mechanisms, etc.
18 |
19 | ## Affected/not affected projects
20 |
21 | Call out the projects in the Strimzi organisation that are/are not affected by this proposal.
22 |
23 | ## Compatibility
24 |
25 | Call out any future or backwards compatibility considerations this proposal has accounted for.
26 |
27 | ## Rejected alternatives
28 |
29 | Call out options that were considered while creating this proposal, but then later rejected, along with reasons why.
30 |
--------------------------------------------------------------------------------
/001-move-strimzi-kafka-operators-to-java-11.md:
--------------------------------------------------------------------------------
1 | # Move Strimzi Kafka operators to Java 11
2 |
3 | Strimzi Kafka operators are currently developed as a Java 8 project.
4 | For a long time, we have had Java 11 in the CI pipelines, but we use Java 8 as the language level as well as the runtime in the container images.
5 | Java 11 should also be supported by Apache Kafka, so that should not be a blocker either.
6 | This proposal suggests a roadmap for moving to Java 11 both at the runtime level and at the language level.
7 |
8 | ## Motivation
9 |
10 | Java 8 is getting old.
11 | More and more libraries now support only newer versions of Java (lately, for example, some of the libraries for Open Policy Agent integration with Kafka).
12 | Java 11 should also have a faster TLS implementation compared to Java 8.
13 | That will become increasingly important as we remove the TLS sidecars, which handled part of the TLS load, and use Java's native TLS support for ZooKeeper communication.
14 |
15 | Maintaining the CI builds for both Java 8 and 11 also requires additional effort.
16 | So moving to Java 11 should allow us to simplify the CI and have fewer builds as well.
17 |
18 | ## Proposal
19 |
20 | This proposal suggests a phased approach, where we first use Java 11 as the runtime in our container images and only later also change the language level.
21 | If any unexpected problems appear while using Java 11 as runtime, we can adjust the schedule or change the plan.
22 |
23 | ### Java 11 as runtime
24 |
25 | At first, we should move to Java 11 as the runtime environment.
26 | This should include the following tasks:
27 | * Changing the JRE version in our images to OpenJDK 11
28 | * Changing the CI builds so that the Java 11 build acts as the main one and is responsible for pushing images, documentation and JARs.
29 | * Update all build documentation (HACKING.md, etc.) with regards to the changes described above.
30 |
31 | This phase should be done immediately after the Strimzi 0.18.0 release, to give us enough time to observe the behavior during development and tests of Strimzi 0.19.0.
32 | If no unexpected issues are discovered, the 0.19.0 release will use images based on Java 11.
33 |
34 | ### Java 11 as the language level
35 |
36 | The move to Java 11 as the language level should include the following tasks:
37 | * Change the Java language level in the Maven build files (a sketch of this change follows this list)
38 | * Remove the Java 8 builds
39 | * Update all build documentation (HACKING.md, etc.) with regards to the changes described above.
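
As an illustration of the Maven change, here is a minimal sketch of what raising the language level could look like in a `pom.xml`, assuming the common `maven-compiler-plugin` property convention (the actual Strimzi build files may structure this differently):

```
<properties>
    <!-- Compile against the Java 11 APIs and emit Java 11 bytecode -->
    <maven.compiler.release>11</maven.compiler.release>
</properties>
```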
40 |
41 | After the 0.19.0 release, which will use Java 11 as the runtime, we should give users another release cycle to use it and provide feedback.
42 | If no unexpected issues are discovered, this phase should be done immediately after the Strimzi 0.20.0 release.
43 | That would mean that the 0.20.0 release will still use Java 11 as the runtime and Java 8 as the language level.
44 | And that 0.21.0 will use Java 11 both as the runtime and as the language level.
45 |
46 | ## Not affected projects
47 |
48 | This has no impact on the other subprojects such as the [OAuth library](https://github.com/strimzi/strimzi-kafka-oauth) or the [Bridge](https://github.com/strimzi/strimzi-kafka-bridge).
49 |
50 | ## Rejected alternatives
51 |
52 | I considered doing all the changes in one step.
53 | But if any problems arise later, the changes might be hard to revert.
54 | The phased approach allows us to react with more flexibility as we progress and change the approach if needed.
55 |
--------------------------------------------------------------------------------
/003-remove-deprecated-topic-operator-from-kafka-crd.md:
--------------------------------------------------------------------------------
1 | # Remove deprecated Topic Operator deployment in Kafka CRD
2 |
3 | | State |
4 | |-------------|
5 | | Implemented |
6 |
7 | When deploying the Topic Operator through the Kafka custom resource, there are two different options:
8 |
9 | * Deploying it as part of the Entity Operator together with the User Operator using `Kafka.spec.entityOperator`
10 |
11 | * Deploying just the Topic Operator using `Kafka.spec.topicOperator`
12 |
13 | `Kafka.spec.topicOperator` is already deprecated.
14 | But it also seems to be broken in three different places:
15 |
16 | * Network policies don't allow the separate Topic Operator to communicate with ZooKeeper and Kafka, because they only list the Entity Operator - not the Topic Operator - in the list of allowed pods.
17 |
18 | * Authorization configuration does not list the Topic Operator as a super user. So when authorization is enabled - even if it manages to connect to Kafka despite the missing network policies - it does not have the rights to list or manage topics in Kafka.
19 |
20 | * The RBAC rights for the Topic Operator do not allow updates of `kafkatopics/status`. So it cannot update the status of the KafkaTopic resource to report any errors.
21 |
22 | These issues have been present for several releases:
23 |
24 | * KafkaTopic status was added in 0.14.0
25 |
26 | * Super users in 0.16.0 (if I recall correctly)
27 |
28 | * Network policies presumably since the beginning
29 |
30 | And it seems that nobody complained.
31 | That suggests that nobody is using it anymore.
32 |
33 | ## Proposed changes
34 |
35 | The feature is already deprecated and seems to not work properly for several releases without anyone noticing.
36 | It can be also easily replaced through the Entity Operator, which offers the same functionality just through a slightly different structure of the Kafka custom resource.
37 |
38 | Therefore I propose to remove this functionality completely from the Cluster Operator code and any related tests and system tests.
39 | It will be replaced with a simple check whether `Kafka.spec.topicOperator` is configured, which will set a condition with a warning:
40 |
41 | ```
42 | - lastTransitionTime: 2020-05-25T19:17:05+0000
43 | message: "Kafka.spec.topicOperator is not supported anymore. Topic operator should be configured at path spec.entityOperator.topicOperator."
44 | status: "True"
45 | type: Warning
46 | ```
47 |
48 | It should stay part of the Kafka custom resource so that any existing resources which still specify it are not rejected by Kubernetes.
49 | It should be removed from the Kafka custom resource definition when moving to version `v1beta2` or to `v1`.
50 | At that point, the warning condition will also be removed.
51 |
52 | The separate Topic Operator deployment in the Kafka CR should also be removed from the documentation.
53 |
54 | ## Rejected alternatives
55 |
56 | I also considered fixing the issues.
57 | But the feature has already been deprecated for a long time, and the problems went unnoticed and unreported for several releases.
58 |
59 | ## Next steps
60 |
61 | If this proposal is approved, the following steps should be taken:
62 |
63 | * Remove the code related to `Kafka.spec.topicOperator` from the cluster operator, replace it with the warning condition, remove all related (system) tests, and remove it from documentation
64 |
65 | * When upgrading the Kafka CR next time to `v1beta2` or to `v1`, remove the `Kafka.spec.topicOperator` and remove the warning condition.
66 |
--------------------------------------------------------------------------------
/004-github-repository-restructuring.md:
--------------------------------------------------------------------------------
1 | # GitHub repository restructuring
2 |
3 | Currently, Strimzi has several GitHub repositories:
4 | * The _main_ Strimzi Kafka Operators repo
5 | * The Kafka Bridge repo
6 | * The Kafka OAuth repo
7 | * The Client Examples repo
8 | * The Strimzi Admin repo for the planned Strimzi Admin API
9 | * The Strimzi lab repo with some demos
10 | * The Strimzi website repo
11 |
12 | Despite us having multiple repositories, some things which relate to Strimzi as a whole are kept only in the operator repository.
13 | For example, the design proposals and the governance documents are central and should apply to the whole of Strimzi, but they live only in the operator repository.
14 | That might not be logical, since some users use only some parts of Strimzi and might miss these things.
15 |
16 | ## Proposed changes
17 |
18 | ### Governance
19 |
20 | We should create a new repo called `governance`.
21 | Into this repo, we should move the `GOVERNANCE.md` document with the Strimzi project governance as well as the `MAINTAINERS` file with the list of maintainers.
22 | In all other repositories, we should have files `GOVERNANCE.md` and `MAINTAINERS.md` which will just contain a link to the corresponding file in the `governance` repository.
23 | Apart from these two files, the `governance` repo should also contain a license file and code of conduct file.
24 |
25 | All changes to the governance and maintainers should be done through PRs in that repository.
26 | The voting should be done either on the PR or on the mailing lists.
27 |
28 | ### Design Proposals
29 |
30 | The design proposals are currently stored in the `design` folder of the `strimzi-kafka-operator` repository.
31 | Right now, the existing proposals focus on the operators.
32 | But it is expected that in the future we would have also proposals for the other components or proposals which include more than one repository.
33 |
34 | We should create a new repository called `proposals` which should be used for the proposals with changes for all our projects.
35 | The repository should contain an overview list of proposals and, for easier orientation, the proposals should be numbered starting from 1.
36 |
37 | The existing proposals should be moved to the new repository and get the numbers assigned.
38 | Any proposals opened at the time of the move should be moved to the new repo as well.
39 |
40 | ### Strimzi roadmap
41 |
42 | Additionally, the Strimzi roadmap is also kept as a project in the Strimzi operators repo.
43 | It is central and includes the overall roadmap for all Strimzi projects.
44 | The roadmap should be moved to the Strimzi organization as an organization-wide project.
45 |
46 | ## Next steps
47 |
48 | If this proposal is approved, the steps described in the proposed changes should be done without undue delay:
49 | * Create the new repositories and move the files
50 | * Move the Roadmap GitHub project
51 | * Add the links to the governance policy and maintainers list to all other repositories.
--------------------------------------------------------------------------------
/008-tls-encrypt-the-kafka-connect-rest-api.md:
--------------------------------------------------------------------------------
1 | # TLS encrypting the Kafka Connect REST API
2 |
3 | This proposal restricts access to the Kafka Connect REST API by enabling TLS encryption on the Kafka Connect REST listener when the connectors running on the Kafka Connect cluster are managed by the `KafkaConnector` or `KafkaMirrorMaker2` operators.
4 |
5 | ## Current situation
6 |
7 | Currently, instances of Kafka Connect that are deployed by the Strimzi operators are configured with the default REST API endpoint settings.
8 | This means that the Kafka Connect REST API endpoint uses HTTP on port 8083 and that the Strimzi `KafkaConnect`, `KafkaConnector` and `KafkaMirrorMaker2` operators make unsecured REST client calls based on this default configuration.
9 |
10 | The default network policies created by the Strimzi operators restrict incoming REST API calls to only allow access from the operator pod.
11 | Users can define further network policies to override the default policy and allow wider access.
12 |
13 | ## Motivation
14 |
15 | There is currently no way to TLS encrypt the Kafka Connect REST API, leaving the Connect REST listener as perhaps the last key endpoint that cannot currently be secured using Strimzi.
16 |
17 | ## Proposal
18 |
19 | This proposal changes the behaviour of the `KafkaMirrorMaker2` operator and the `KafkaConnect` operator.
20 | In the `KafkaConnect` operator case, the behaviour is only changed when the `strimzi.io/use-connector-resources: "true"` annotation is applied in the `KafkaConnect` CR (meaning that all connectors running on the Connect cluster are managed by the `KafkaConnector` operator).
21 |
22 | The default behaviour of the `KafkaMirrorMaker2` operator and the `KafkaConnect` operator will be to deploy Kafka Connect clusters with a single TLS encrypted REST API listener.
23 | The self-signed TLS certificate and key required to enable TLS encryption on the REST API listener are generated by the operator and stored in Secrets for use by the operator clients and the Kafka Connect pods.
24 | The certificate and key are mounted in each Kafka Connect pod, which creates an SSL truststore and keystore from them and sets the following properties in the generated Connect configuration to enable an HTTPS connection:
25 |
26 | ```
27 | listeners: https://:8443
28 | rest.advertised.listener: https
29 | rest.advertised.port: 8443
30 | listeners.https.ssl.client.auth: none
31 | listeners.https.ssl.truststore.location: /tmp/kafka/kafka-connect-rest.truststore.p12
32 | listeners.https.ssl.truststore.password: ***generated password***
33 | listeners.https.ssl.truststore.type: PKCS12
34 | listeners.https.ssl.keystore.location: /tmp/kafka/kafka-connect-rest.keystore.p12
35 | listeners.https.ssl.keystore.password: ***generated password***
36 | listeners.https.ssl.keystore.type: PKCS12
37 | ```
38 |
39 | To communicate with the encrypted Connect REST API listener, the `KafkaMirrorMaker2`, `KafkaConnect` and `KafkaConnector` operators require modifications to the `AbstractConnectOperator` to apply the TLS certs as truststore options on the Vertx WebClient.
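
A minimal sketch of what applying the truststore options could look like, assuming the standard Vert.x `WebClientOptions` API (the helper, paths and password handling below are illustrative, not the actual operator code):

```
import io.vertx.core.Vertx;
import io.vertx.core.net.PfxOptions;
import io.vertx.ext.web.client.WebClient;
import io.vertx.ext.web.client.WebClientOptions;

public class ConnectRestClientFactory {
    // Hypothetical helper: build a WebClient that trusts the operator-generated certificate
    public static WebClient create(Vertx vertx, String truststorePath, String truststorePassword) {
        WebClientOptions options = new WebClientOptions()
                .setSsl(true)
                // Trust the self-signed REST API certificate from its PKCS12 truststore
                .setPfxTrustOptions(new PfxOptions()
                        .setPath(truststorePath)
                        .setPassword(truststorePassword));
        return WebClient.create(vertx, options);
    }
}
```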
40 |
41 | ### Enabling a plain HTTP REST API listener
42 |
43 | A new field is added to the `KafkaConnect` and `KafkaMirrorMaker2` specs named `restListeners`, which allows users to configure an additional unencrypted HTTP listener on port 8083 in the Kafka Connect configuration.
44 | This enables users to access the REST API without TLS if required.
45 | The `KafkaConnector`, `KafkaConnect` and `KafkaMirrorMaker2` operators will continue to use the TLS encrypted listener when an unencrypted HTTP listener is enabled.
46 |
47 | The `restListeners` field is designed for potential future extension, and is a list of items that contain a `protocol` field.
48 | For this proposal, the `protocol` field only supports the value `http`. One (or more) items in the `restListeners` list will add the unencrypted HTTP listener on port 8083.
49 |
50 | For example, the following CR will enable the HTTP listener on port 8083:
51 | ```
52 | apiVersion: kafka.strimzi.io/v1beta1
53 | kind: KafkaConnect
54 | spec:
55 | restListeners:
56 | - protocol: http
57 | ...
58 | ```
59 |
60 | The schema for `restListeners` is as follows:
61 | ```
62 | openAPIV3Schema:
63 | type: object
64 | properties:
65 | spec:
66 | type: object
67 | properties:
68 | restListeners:
69 | type: array
70 | items:
71 | type: object
72 | properties:
73 | protocol:
74 | type: string
75 | enum:
76 | - http
77 | description: The protocol of the REST listener.
78 | required:
79 | - protocol
80 | description: List of additional REST listeners.
81 | ...
82 | ```
83 |
84 |
85 | ### Certificate renewals
86 |
87 | The self-signed TLS certificate will have a 1-year expiration.
88 | Thirty days before the self-signed TLS certificate expires, the operator will automatically renew the certificate, replace the old self-signed certificate, and restart the pods in a controlled manner to ensure the new certificates are in use by Kafka Connect without losing availability.
89 |
90 | This would need to be a multi-phase process consisting of the following steps:
91 |
92 | 1. Generate new certificate
93 | 2. Distribute the new certificate to all pods and cluster operator truststores
94 | 3. Replace the key and roll all Connect pods
95 | 4. When the old certificate expires, remove it from the truststores and roll all Connect pods
96 |
97 |
98 | ## Compatibility
99 |
100 | Switching on TLS encryption for the Connect REST API does not directly affect users of Strimzi custom resources, but given that the Connect REST API is a well-known Kafka interface, there may be Strimzi users who currently access the REST API directly, for example with tooling or for monitoring.
101 | TLS encryption would mean those users could no longer access the REST API as they did before. However, this can be overridden using the `spec.restListeners` field if desired.
102 |
103 |
104 | ## Rejected Alternatives
105 |
106 | ### Annotation alternative to enable a plain HTTP REST API listener
107 |
108 | Alternatively, a new annotation `strimzi.io/enable-plain-rest-listener` could be added, to avoid changing the spec.
109 | Setting the `strimzi.io/enable-plain-rest-listener` annotation value to `true` would add the additional unencrypted HTTP listener on port 8083.
110 | This was rejected as it is generally better to have configuration in the `spec`, and the proposed `restListeners` structure could be extended in future enhancements.
111 |
--------------------------------------------------------------------------------
/013-kafka-canary.md:
--------------------------------------------------------------------------------
1 | # Strimzi Canary
2 |
3 | Implement a Kafka canary tool which will act as an indicator of whether Kafka clusters are operating correctly.
4 | This would be achieved by creating a canary topic and periodically producing and consuming events on the topic.
5 |
6 | ## Current situation
7 |
8 | Currently, Strimzi does not have any canary-style feature.
9 | It does have health checks for the Kafka cluster, which could be replaced or augmented by the canary.
10 |
11 | ## Motivation
12 |
13 | The canary would provide an indication that the Kafka cluster is operating as expected from a user's perspective, i.e. that messages can successfully be produced and consumed.
14 | Some metrics would be exported to reflect the canary's activity. Initial metrics would include:
15 | * records-produced-total
16 | * records-consumed-total
17 | * produce-error
18 | * consume-error
19 | * latency
20 |
21 | In future the canary can be used by the Kafka Roller to improve health checks.
22 | However this is beyond the scope of the current proposed work.
23 |
24 | ## Proposal
25 |
26 | The canary would be run as a separate pod alongside the Kafka cluster.
27 | Once the canary topic is created, messages will be produced to and consumed from it.
28 | Messages would be consumed at a rate no faster than 1 message per second, and possibly as slow as 1 message every 10 seconds. This rate may be configurable.
29 |
30 | The canary will be built using Golang and metrics will be exposed in Prometheus format through a REST API.
31 | Currently Sarama is being considered as the client library.
32 |
33 | The current plan is to deploy the canary independently alongside the Kafka cluster.
34 | However, in future, consideration should be given to integrating the canary with Strimzi in a manner similar to the Kafka Exporter or Cruise Control, i.e. the canary would be specified (optionally) in the Kafka custom resource and then deployed by the Strimzi operator.
35 |
36 | ### Topic Configuration
37 |
38 | The topic will be configured to have a partition on each broker node and a replication factor which will be the minimum of _number of Kafka broker nodes_ and _3 (which is the most commonly used value)_.
39 | The minimum in-sync replicas will be either 1, in case the replication factor is 1 (there is only one broker in the cluster), or one less than the number of replicas.
40 | The configuration should be something like the following:
41 |
42 | * Partitions = N (where N is the number of brokers)
43 | * Replication Factor = Min(number-of-brokers, 3)
44 | * Min ISR = Max(1, Replication Factor - 1)
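
For example, a 5-broker cluster would get a topic with 5 partitions, a replication factor of 3 (Min(5, 3)) and minimum in-sync replicas of 2 (Max(1, 3 - 1)).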
45 |
46 | Regarding storage sizing, the topic should use a smaller segment size and retention (in bytes) in order to avoid filling the disk space.
47 |
48 | #### Pros
49 |
50 | * Allows for a broker to be down, therefore accommodating rolling updates.
51 | * Mimics the availability that a user would expect.
52 |
53 | #### Cons
54 |
55 | * It is unclear which broker is down. This means that if the tool indicates a broker is down for a period of time, we cannot tell whether it is the same broker (indicating a problem) or a different broker as a rolling update occurs (not a problem).
56 |
57 | #### Considerations for this approach
58 |
59 | The canary tool must ensure, when creating the topic, that the partitions and replicas are spread across all brokers. E.g. if there are 10 brokers (meaning 10 partitions) and 3 replicas, the replicas must not all be placed on only 3 of the brokers.
60 |
61 | How the partitions are handled when the number of brokers scales must also be considered.
62 | When scaling up, partitions must be added on the new additional brokers so that each of them is the leader for one new partition.
63 |
64 | However, when scaling down, partitions cannot be removed (the topic would have to be deleted and recreated).
65 | The solution to this would be to allow ‘orphaned’ partitions to remain.
66 | Their leaders will be elected on some of the remaining brokers and the canary tool will not produce to them.
67 | However, if the number of brokers scales up again, new leader elections (preferred leader election) must take place to put the partition leaders on the new brokers.
68 |
69 | ### Message scheme
70 |
71 | The messages exchanged between the producer and consumer running in the canary tool will use a well-defined scheme based on a JSON payload.
72 |
73 | The message `key` could be null because the producer is going to specify the partition to which to send the message.
74 | The message payload will be in JSON format carrying the following fields:
75 |
76 | * `producerId`: producer identifier to enable running different producers in the tool if needed. For example, for covering different scenarios with more than one producer sending messages directly to the Kafka cluster from inside Kubernetes, or going through Ingresses reaching the cluster from outside; this could lead to different meaningful metrics values.
77 | * `messageId`: identifier of the message, used to correlate the right message on the consumer side in order to evaluate metrics (e.g. latency).
78 | * `timestamp`: time in milliseconds when the message was sent, used to evaluate metrics (e.g. latency).
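
An illustrative message payload following this scheme (all field values are examples only):

```
{
  "producerId": "strimzi-canary-client",
  "messageId": 42,
  "timestamp": 1606813456123
}
```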
79 |
80 | ## Affected/not affected projects
81 |
82 | The creation of the canary topic would cause the Topic Operator to create a corresponding KafkaTopic resource; we should avoid that.
83 | The Topic Operator should be modified in order to filter out the canary topic.
84 |
85 | ## Compatibility
86 |
87 | The canary does not impact any existing functionality.
88 |
89 | ## Rejected alternatives
90 |
91 | The option to build the canary tool directly into the Strimzi operator was considered.
92 | However, as the canary would be optional, it was decided that it is better to deploy it as its own pod.
93 |
94 | Java has been suggested as a language for the tool as it is the language of the Strimzi operator.
95 | However, as the tool will run in a separate pod, using Java would mean another JVM, increasing the resource usage of the tool.
96 |
97 | Alternative configurations for the topic were also explored.
98 |
99 | ### Option 1: Partition over replicas
100 |
101 | Create a topic with 1 partition per broker node and a replication factor of 1 (i.e. no additional replicas).
102 | The canary would produce/consume from each partition.
103 |
104 | #### Pros
105 |
106 | * Errors will indicate that a specific broker is unavailable.
107 |
108 | #### Cons
109 |
110 | * Brokers are expected to be unavailable during rolling updates after cluster configuration changes.
111 |
112 | ### Option 2: Replicas over partition
113 |
114 | Create a topic with a single partition and a replication factor equal to the number of brokers and a minimum in-sync replicas of one less than the number of brokers.
115 |
116 | #### Pros
117 |
118 | * Allows for one (and only one) broker to be down, therefore accommodating rolling updates.
119 | * Mimics the availability that a user would expect.
120 | * Ensures that all brokers - except one, which can be down - are working, as they need to be in-sync.
121 |
122 | #### Cons
123 |
124 | * It is unclear which broker is down. This means that if the tool indicates a broker is down for a period of time, we cannot tell whether it is the same broker (indicating a problem) or a different broker as a rolling update occurs (not a problem).
125 |
126 |
--------------------------------------------------------------------------------
/014-move-docker-images-to-quay.io.md:
--------------------------------------------------------------------------------
1 | # Move Container Images to Quay.io
2 |
3 | Starting from November 2020, Docker is introducing service limits for container images stored in Docker Hub.
4 | Free accounts will have the following rate limits:
5 | * 100 pulls for anonymous users per 6 hours
6 | * 200 pulls for authenticated users (on the free plan) per 6 hours
7 |
8 | In addition, it is also planned that images without any activity (image push or pull) will be kept only for 6 months (this might not apply from November already, but only later).
9 | For more details, see [Docker Hub Pricing & Subscriptions](https://www.docker.com/pricing).
10 |
11 | ## Motivation
12 |
13 | The limits might impact Strimzi on several levels:
14 | * Developers developing Strimzi who might need to pull the images often for development and test purposes
15 | * CIs running under different accounts
16 | * Users who might want to pull the images
17 |
18 | The image expiration policy would also remove the images from our Docker Hub account and make them unavailable to users using older versions.
19 |
20 | ## Proposal
21 |
22 | This proposal suggests to:
23 | * Start using [Quay.io](https://quay.io/) as our new container registry for all master builds (`:latest` images).
24 | * Start using [Quay.io](https://quay.io/) for releases starting with Strimzi Kafka Operators 0.21.0 and Strimzi Kafka Bridge 0.20.0 releases.
25 | * Move the client-examples and any UI related images to [Quay.io](https://quay.io/) as well.
26 |
27 | Additionally, we should make a copy of previous releases to [Quay.io](https://quay.io/) as well in order to:
28 | * Make sure the releases are not lost.
29 | * Let users, if needed, manually change their installation files for operator releases 0.20.0 and earlier and bridge releases 0.19.0 and earlier to use [Quay.io](https://quay.io/) as well.
30 |
31 | Our container image structure (names, number of images, etc.) changed significantly in the Strimzi 0.12.0 release.
32 | So the copying to [Quay.io](https://quay.io/) should be done for all images for release 0.12.0 and later.
33 | This will also help to avoid confusion among users who might find the old, no-longer-updated images and believe that they are the latest versions.
34 | The copying should be done only for releases - release candidates or the `latest` images will not be copied (the `latest` images should be pushed from new builds instead).
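
Such a copy could be done, for example, with a tool like `skopeo`, which copies images (including all architectures) directly between registries. The image name below is illustrative only:

```
skopeo copy --all docker://docker.io/strimzi/operator:0.12.0 docker://quay.io/strimzi/operator:0.12.0
```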
35 |
36 | [Quay.io](https://quay.io/) also offers some more advanced features such as robot (service) accounts for easier CI integration or security scanning.
37 |
38 | ## Rejected alternatives
39 |
40 | There are several available container repositories I'm aware of and which I considered:
41 | * Google Cloud and Amazon AWS registries are bound to an account and are not free (AFAIK you pay for used storage and data transfers). So we would need to organize a shared Strimzi account and make sure the costs are covered.
42 | * GitHub container registry is currently available only as a beta. Any future pricing and availability is not clear.
43 | * Docker Hub offers a program for Open Source projects. But it is not clear what this program offers to accepted open source projects - it exists just as an application form. So this does not seem to be transparent.
44 |
--------------------------------------------------------------------------------
/016-modularizing-strimzi-ui.md:
--------------------------------------------------------------------------------
1 |
2 | # Modularizing Strimzi UI
3 |
4 | The following proposal is a recommendation that we move from one monolithic web client to a web client that is made up of smaller packages.
5 |
6 | ## Current situation
7 |
8 | Currently the [Strimzi UI](./011-strimzi-ui.md) is being developed as one monolithic application for the web client and web client server. In order to help isolate code into manageable packages, it is recommended that the project move to a modularized architecture.
9 |
10 | In the proposed architecture change, the web client will be split up into smaller packages. This architecture will help clearly define functional areas and help with separation of concerns within the web client and the web client server. We will continue to have one single source of truth by using the current repo; however, smaller packages will allow the multiple web clients (e.g. Carbon web client, PatternFly web client, etc.) to pull in only the packages that are needed to build each client.
11 |
12 | ## Motivation
13 |
14 | By implementing this proposal the project will gain the following benefits:
15 |
16 | * **Simplified dependency management for view layers:** Each of the view layers that make up the web clients can pull in the functionality that the view needs by including only the packages that are required for that view.
17 |
18 | * **Focus on immediate needs for each web client:** To build each of the different web clients, the views that are displayed to the user are currently swapped out at build time. This is done via webpack normalization. To support this, at minimum, skeleton code for each of the views needs to be added in order for each client to build. Short term, this can result in extra code being generated and additional rework in the future, based on each web client's needs for its consumers. In order to reduce this rework and allow each web client to develop views as needed, it is recommended that the view layer be stored in a separate package (e.g. ui-carbon, ui-patternfly).
19 |
20 | * **Separation of concerns:** Each of the packages will focus on a specific area of concern. This will help in code clarity as well as reusability.
21 |
22 | * **Simplify build process:** By introducing smaller packages it will reduce our dependency on custom webpack and build scripts. This will make it easier for contributors to understand how the project is built.
23 |
24 | ## Proposal
25 |
26 | ### Tools
27 | In order to support a modular architecture the following tools will be introduced:
28 |
29 | - [`Lerna`](https://github.com/lerna/lerna) We are using Lerna to support the development of each package and manage all the packages that make up this project.
30 |
31 | - [`Yarn Package Manager`](https://yarnpkg.com/) Yarn package manager allows us to manage which dependencies get installed, to make sure our software is rebuilt in a consistent, stable manner. We are also using Yarn workspaces together with Lerna to build each of the packages. (A minimal sketch of this setup follows.)
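
A minimal sketch of this setup, assuming the packages live under a `packages/` directory (the layout and versioning scheme are illustrative, not the final repository structure) - `lerna.json`:

```
{
  "npmClient": "yarn",
  "useWorkspaces": true,
  "packages": ["packages/*"],
  "version": "independent"
}
```

And the root `package.json` declaring the workspaces:

```
{
  "private": true,
  "workspaces": ["packages/*"]
}
```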
32 |
33 | ### Package structure
34 |
35 | The following is the proposed package structure for the mono repo:
36 |
37 |
38 | | Package | Description | npm package |
39 | | ------- | ----------- | ----------- |
40 | | models | Contains the data models that will be used by the server and client | @strimzi-ui/models |
41 | | services | Contains the business logic for the UI. This acts as the controller in the MVC model. | @strimzi-ui/services |
42 | | ui-(Web-Client-Framework) | Packages prefixed with `ui-` act as the view layer. There will be multiple view layers, one for each web client being developed for this project (e.g. ui-carbon, ui-patternfly, etc.). | @strimzi-ui/ui-patternfly, @strimzi-ui/ui-carbon |
43 | | ui-server | This package contains the web client server, responsible for serving the UI and proxying requests. | @strimzi/ui-server |
44 |
45 |
46 | ## Affected/not affected projects
47 |
48 | This will affect the strimzi-ui project.
49 |
50 | ## Compatibility
51 |
52 | Not applicable
53 |
54 | ## Rejected alternatives
55 |
56 | Not applicable
57 |
--------------------------------------------------------------------------------
/018-rest-admin-api.md:
--------------------------------------------------------------------------------
1 | # Use the admin-server REST API in strimzi-ui
2 |
3 | The [admin-server](012-admin-server.md) is designed to support a REST API and a GraphQL API. The
4 | [strimzi-ui](011-strimzi-ui.md) is currently designed to use the GraphQL API.
5 |
6 | Having started using the GraphQL API, it is clear that the entities and operations that the UI requires are better
7 | suited to a REST API. This proposal updates the [strimzi-ui](011-strimzi-ui.md) design to use the REST API exposed by
8 | the [admin-server](012-admin-server.md).
9 |
10 | ## Current situation
11 |
12 | Currently [strimzi-ui](011-strimzi-ui.md) is using the GraphQL API from the [admin-server](012-admin-server.md).
13 |
14 | ## Motivation
15 |
16 | GraphQL provides a number of benefits for APIs including:
17 |
18 | * a way for clients to follow references between entities in a single request
19 | * a way for clients to describe the data they want, and get returned only that data
20 | * a typed schema (with the ability to define new types)
21 | * built-in support for schemas
22 |
23 | It also brings a number of challenges:
24 |
25 | * support for API versions is not as well defined as it is with REST
26 | * federating APIs is much harder and introduces a performance hit
27 | * complexity both in the server code and in the client code
28 | * whilst querying data is simple, mutating data is not as simple when using GraphQL
29 |
30 | Applying this to Strimzi we feel that the disadvantages of using GraphQL outweigh the benefits:
31 |
32 | * the data exposed does not have complex bi-directional relationships - the number of entities is small and entities do
33 | not have many references to other entities - this removes much of the benefit of GraphQL
34 | * typically the UI requires the entire entity (there is not a lot of "optional" data)
35 | * strimzi-ui must federate APIs (across Kafka instances and across Kafka and configuration) - federating this using REST
36 | is simple but complex with GraphQL
37 | * the Strimzi Bridge already exposes a REST API, so using REST fits better with the strimzi ecosystem
38 | * as the surface area of the API is small, the code complexity of using GraphQL outweighs the benefits seen for a large
39 | API (e.g. via built-in schema support)
40 | * supporting versioned APIs will be beneficial for the API as it can allow multiple versions of strimzi to be used with
41 | a single UI
42 |
43 | ## Proposal
44 |
45 | ### Prioritize development of the strimzi-admin REST support
46 |
47 | The strimzi-admin project currently intends to support both REST and GraphQL. If this proposal is adopted then the
48 | GraphQL support will be dropped and the REST support continued.
49 |
50 | ### Adjust strimzi-ui to use REST API
51 |
52 | The strimzi-ui project will be adjusted to use the REST API:
53 |
54 | * the entity model will be generated from the OpenAPI REST API schema
55 | * the introspection features currently defined for the strimzi-ui project will be replaced by metadata APIs.
56 |
57 | ## Affected/not affected projects
58 |
59 | The strimzi-ui project is affected (as outlined in the Proposal). Other projects are not.
60 |
61 | ## Compatibility
62 |
63 | There are no compatibility issues, as the API is still under development.
64 |
65 | ## Rejected alternatives
66 |
67 | There were no rejected alternatives.
68 |
--------------------------------------------------------------------------------
/019-restruture-the-installation-files.md:
--------------------------------------------------------------------------------
1 | # Restructure the installation files
2 |
3 | This proposal suggests restructuring to the folders of the [`strimzi-kafka-operator` GitHub repository](https://github.com/strimzi/strimzi-kafka-operator).
4 |
5 | ## Current situation
6 |
7 | Currently, the Strimzi installation files in our master branch correspond to the images built from the master branch.
8 | That means they are under constant development and often work only with the freshly pulled images.
9 | That works well for development, but not necessarily for the users.
10 | They often check out the GitHub repository, see the install or example folders and try them, not realizing that they're using `master` rather than a stable release.
11 | If they just use them for a short time it should not matter.
12 | But if they keep using them, they will run into issues later.
13 |
14 | ## Motivation
15 |
16 | Improve the experience of the users who checkout Strimzi and want to install / try it.
17 |
18 | ## Proposal
19 |
20 | We should change the way we manage these files.
21 | This proposal suggests the following (a sketch of the resulting layout follows the list):
22 | * Create a new subdirectory `packaging`
23 | * This subdirectory will contain the _in-development_ versions of `install`, `examples` and `helm-chart` directories.
24 | * Any changes in regular PRs, generated CRD files etc. will be done in this directory
25 | * New releases will use these files for packaging
26 | * The release artifacts will remain unchanged, they will just be built from this directory and not from the original ones
27 | * The original `install`, `examples` and `helm-chart` directories will be kept, but will contain the files for the latest stable release
28 | * Updates to them will be done only during / after release
29 | * They will not be changed with regular PRs
30 | * A README.md file will be added to them to clarify that they contain the latest stable release and should not be changed as part of development, but only when new versions are released
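
Under this proposal, the resulting top-level layout would look roughly like this (simplified sketch):

```
strimzi-kafka-operator/
├── install/        <-- latest stable release, updated only on release
├── examples/       <-- latest stable release, updated only on release
├── helm-chart/     <-- latest stable release, updated only on release
└── packaging/
    ├── install/    <-- in-development, changed by regular PRs
    ├── examples/   <-- in-development, changed by regular PRs
    └── helm-chart/ <-- in-development, changed by regular PRs
```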
31 |
32 | ### Risks
33 |
34 | The proposal keeps the existing directories with more or less the same files, but changes their purpose.
35 | That can be confusing for people using them today for the right purpose.
36 | It might also be confusing for new contributors, who might try to change them in their PRs.
37 |
38 | ## Affected/not affected projects
39 |
40 | This proposal affects only the [`strimzi-kafka-operator` GitHub repository](https://github.com/strimzi/strimzi-kafka-operator).
41 |
42 | ## Compatibility
43 |
44 | The release artifacts themselves remain unchanged; only the location in the repository from which they are built changes, so no compatibility impact is expected.
45 |
46 | ## Rejected alternatives
47 |
48 | Following options were considered but I decided against them at the end:
49 |
50 | ### Moving the released files into separate directory
51 |
52 | * A new directory `deploy` will be created
53 | * This directory will have its own versions of the `install`, `examples` and `helm-chart` directories
54 | * Their content will correspond to the latest stable release
55 | * They will be updated only when a new release happens, but not with regular PRs
56 | * `deploy` directory is common also on some other projects, so people should understand its purpose
57 |
58 | The disadvantage would be that old links would stop working and people might be confused.
59 | On the other hand, it would make life easier for people used to using the current directories for development, since they would notice the directories are missing instead of installing the latest release by mistake.
60 |
61 | ### Removing the released Helm Chart
62 |
63 | The released Helm Charts are on the website and can be pulled from there.
64 | I'm not sure how often the latest Helm Chart is really installed directly from the file,
65 | but we generate it during the release anyway.
66 | So keeping it there should not add too much effort.
67 |
--------------------------------------------------------------------------------
/020-rename-default-branch-of-strimzi-github-repositories.md:
--------------------------------------------------------------------------------
1 | # Rename the default branch of Strimzi GitHub repositories
2 |
3 | CNCF is committed to using inclusive naming.
4 | As a CNCF project, we should try to comply with the suggestions and recommendations.
5 | One of these is naming of the default branch of our GitHub repositories.
6 |
7 | ## Current situation
8 |
9 | Currently, all our GitHub repositories use `master` branch as the default branch.
10 | This was the default name used by GitHub in the past as well as the name used by most projects.
11 |
12 | ## Motivation
13 |
14 | The main motivation is following the recommendations of the [Inclusive Naming Initiative (INI)](https://inclusivenaming.org/).
15 | Members of INI include CNCF as well as many other organizations and companies (including employers of many Strimzi maintainers, committers and contributors).
16 | Please read the [INI Word replacement list](https://inclusivenaming.org/language/word-list/) for the reasoning why the name `master` might not be recommended.
17 |
18 | ## Proposal
19 |
20 | We should change the name of the default branch from `master` to `main`.
21 | `main` is the new default name used by GitHub for newly created repositories.
22 | It is also one of the recommendations from the [INI Word replacement list](https://inclusivenaming.org/language/word-list/).
23 |
24 | GitHub tries to make renaming the default branch as easy as possible.
25 | Renaming the branch should automatically update all PRs, branch protection rules, redirect links and more.
26 | For more details about renaming the default branch, see [Renaming the default branch from `master`](https://github.com/github/renaming).
27 |
28 | ### Risks
29 |
30 | Despite GitHub trying to make it as easy as possible, renaming the default branch will still cause some disruptions:
31 |
32 | * Users / Developers will need to get used to the new default branch name (for example when using commonly used commands such as `git rebase master` etc.)
33 | * Users / Developers will need to update their local repositories to use the new branch (for example, with the commands shown below)
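
A sketch of the commands a developer would typically run in an existing local clone after the rename (based on GitHub's guidance for renamed branches):

```
git branch -m master main
git fetch origin
git branch -u origin/main main
git remote set-head origin -a
```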
34 |
35 | ## Affected / not affected projects
36 |
37 | This proposal affects all Strimzi GitHub repositories.
38 |
39 | ## Compatibility
40 |
41 | Users will need to switch to the new default branch.
42 | But this should not cause any compatibility issues for the actual Strimzi applications.
43 |
44 | ## Rejected alternatives
45 |
46 | This proposal follows the INI recommendations.
47 | There are currently no rejected alternatives.
48 | We could also consider using some name other than `main` for the default branch, but there does not seem to be any reason not to follow GitHub's new default name.
49 |
--------------------------------------------------------------------------------
/021-special-repository-for-st-clients-based-on-example-clients.md:
--------------------------------------------------------------------------------
1 | # Special repository for ST clients based on example clients
2 |
3 | This proposal suggests creating a new repository for `Strimzi` ST client based on [Strimzi client-examples](https://github.com/strimzi/client-examples).
4 |
5 | ## Current situation
6 |
7 | Currently, we are using two clients in our STs:
8 | - `InternalKafkaClients` (based on `test-client` image)
9 | - `example clients` (from [Strimzi client-examples](https://github.com/strimzi/client-examples)).
10 |
11 | The plan is to remove the `InternalKafkaClients` and keep only the `example clients`.
12 | The `test-client` is not sufficient anymore: it can only create a single producer/consumer, send/receive messages, and then we can assert the result.
13 | We are stuck there, as we need to wait until a producer/consumer is finished.
14 | With `example clients` we are able to do a lot more.
15 | For example - create a continuous job for sending messages with a delay,
16 | _stack_ the producers to create _traffic_, add extra configuration, use different types of producers/consumers (for Bridge, Kafka, ...) and much more.
17 |
18 | ## Motivation
19 |
20 | While testing `Strimzi` we need, in some cases, special configuration of the clients, which is not implemented in the `client-examples`.
21 | The `client-examples` repository should be really _exemplary_,
22 | and we should not add any extra _configuration_ or _extensions_ to it.
23 | For this kind of enhancement we should have a repository which has `client-examples` as its base,
24 | where we will be able to add special settings without disrupting the basic idea of the example clients.
25 |
26 | ## Proposal
27 |
28 | * Create a new repository for `systemtest client`
29 | * name will be `test-clients`
30 | * component owners will be the same as for STs
31 | * PR checks:
32 | * DCO
33 | * build - `mvn` build, checkstyle (with some simple UTs or ITs)
34 | * complex implementation of clients for testing
35 | * we'll use both Kafka and Bridge clients from `client-examples`
36 | * will be based on [Strimzi client-examples](https://github.com/strimzi/client-examples) - we'll copy the
37 | `client-examples` code and then modify it - each repo will then _go its own way_
38 | * we'll be able to modify it with our special configuration
39 | * the main idea of the example clients remains intact
40 |
41 | * The original `client-examples` repository will be kept
42 |
43 | ## Advantages
44 |
45 | There are many things we can implement.
46 | A good example is propagating exceptions and return codes into the `job` status (as we are using `k8s` jobs for deploying the example clients) and asserting on it in tests.
47 | Currently, we have to grep exceptions from the job log - which can be a problem.
48 |
49 | ## Images and releases
50 |
51 | The images will be built, as in `client-examples`, after each merged PR and pushed to the `strimzi` repository on `quay.io` with the `latest` tag.
52 | The `systemtest client` will have its own release cadence and versioning, which will depend on new features.
53 |
54 | ## Kafka version
55 |
56 | The `systemtest client` will support the latest released version of Kafka along with several older ones.
57 | We'll start with version `2.5.0`, as it's the oldest version supported by the latest released version of Strimzi (0.22.1),
58 | and we'll support it for as long as the `test-clients` features allow us to use that specific version, or until we decide to deprecate it completely.
59 | There will be a separate tag for each supported Kafka version - the pattern will be `test-client/TEST_CLIENTS_VERSION-kafka-KAFKA_VERSION`.
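
For illustration, a hypothetical first release could be tagged `test-client/0.1.0-kafka-2.5.0` (the `0.1.0` test-clients version is an example only).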
60 |
61 | ## Implementation
62 |
63 | The client will be implemented in Java, the same as the `client-examples`.
64 |
65 | ## Affected/not affected projects
66 |
67 | Only the `systemtest` part of `Strimzi` will be affected.
68 |
69 | ## Rejected alternatives
70 |
71 | There are no rejected alternatives at the moment.
--------------------------------------------------------------------------------
/023-using-ubi8-as-base-image.md:
--------------------------------------------------------------------------------
1 | # Using Red Hat Universal Base Image 8 as the new Strimzi base image
2 |
3 | This proposal suggests using Red Hat UBI8 as the new Strimzi base image.
4 |
5 | ## Current situation
6 |
7 | Strimzi container images are currently based on the CentOS 7 container images.
8 | The CentOS 7 container images are stored on Docker Hub and used to build all Strimzi images.
9 |
10 | ## Motivation
11 |
12 | Using the CentOS 7 image has three main disadvantages:
13 | * It is hosted on Docker Hub, which has a limited number of anonymous pulls (see [proposal 14](https://github.com/strimzi/proposals/blob/main/014-move-docker-images-to-quay.io.md) for more details).
14 | * CentOS 7 images are not released very often (at the time of writing, the image is 2 months old), so we need to install a lot of updates while building the images
15 | * CentOS 7 receives CVE fixes in batches, often only after they are available in other images
16 |
17 | _Note: There have recently been many discussions about changes in the CentOS project.
18 | But to my knowledge, they affect only the CentOS 8 version.
19 | CentOS Streams and CentOS 7 remain unchanged.
20 | CentOS 7 should receive maintenance updates until the year 2024.
21 | So there is no time pressure to move to other base image caused by these changes._
22 |
23 | ## Proposal
24 |
25 | Strimzi should move to use Red Hat Universal Base Image 8 (UBI 8) as a base image.
26 | UBI8 is based on Red Hat Enterprise Linux 8 and is available to everyone for free (it does not include any Red Hat support).
27 | It can be pulled without any registration from `registry.access.redhat.com/ubi8/ubi-minimal:latest` and be redistributed (i.e. we can push images built on this base image into our repositories).
28 | The details can be found in the [Red Hat UBI EULA](https://www.redhat.com/licenses/EULA_Red_Hat_Universal_Base_Image_English_20190422.pdf) and in the related [FAQ](https://developers.redhat.com/articles/ubi-faq#).
29 |
30 | UBI8 normally receives updates earlier than CentOS 7.
31 | New versions of the image are also released more often.
32 | And since it is not hosted on Docker Hub, it is not subject to Docker Hub pull limits.
33 |
34 | There are several versions of the UBI8 image.
35 | The `minimal` variant is the smallest and should be used by Strimzi.
36 |
37 | ### Affected repositories
38 |
39 | This proposal covers the container images from the following repositories:
40 | * `strimzi-kafka-operator`
41 | * `strimzi-kafka-bridge`
42 | * `client-examples`
43 | * `test-clients`
44 | * `strimzi-canary`
45 |
46 | ### Implementation
47 |
48 | If approved, the Dockerfile files in the affected Strimzi projects will be updated to use the `ubi-minimal` image.
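
In practice, this means changing the base image reference at the top of each Dockerfile, roughly as follows (a sketch only; the actual Dockerfiles contain further build steps):

```
FROM registry.access.redhat.com/ubi8/ubi-minimal:latest
```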
49 |
50 | ## Compatibility
51 |
52 | There are no compatibility issues.
53 |
54 | ## Rejected alternatives
55 |
56 | For this proposal, I considered mainly the Red Hat and CentOS images.
57 | Unlike images for other Linux distributions, they are very close to the current image.
58 | So moving to the new base image should mean minimal disruption and not too many changes.
59 |
--------------------------------------------------------------------------------
/024-adopt-the-kafka-static-quota-plugin.md:
--------------------------------------------------------------------------------
1 | # Adopt the Kafka Static Quota plugin
2 |
3 | ## Current situation
4 |
5 | One of the common problems when running Apache Kafka is running out of disk space.
6 | Strimzi currently does not offer any protection against this.
7 | Once the disk gets full, usually the only way to recover is to either delete some files (log segments) or increase the disk size.
8 |
9 | ## Motivation
10 |
11 | Having a solution that protects against disks getting full would be a good improvement for Strimzi users.
12 |
13 | ## Proposal
14 |
15 | Strimzi should adopt the [Kafka Static Quota plugin](https://github.com/lulf/kafka-static-quota-plugin).
16 |
17 | ### Quota types
18 |
19 | It provides two types of quotas:
20 |
21 | #### Per-broker produce and fetch quotas
22 |
23 | Apache Kafka itself offers only quotas per client / user.
24 | The Kafka Static Quota plugin allows configuring an overall produce and/or fetch quota per broker.
25 | The quota will be distributed between the clients connected to the broker.
26 | The produce / fetch quotas currently don't support per-listener quotas.
27 | There is always only one quota for all listeners.
28 | The replication traffic is not counted towards the quota.
29 |
30 | #### Storage quotas
31 |
32 | The Kafka Static Quota plugin allows users to configure two storage limits: soft and hard.
33 | After the soft limit is breached, the plugin starts throttling producers.
34 | The allowed throughput is linearly decreased until the hard limit is reached.
35 | Once the hard limit is reached, the allowed produce throughput will be 0.
36 | The storage quotas currently don't support JBOD storage, only brokers with a single log directory.
37 |
38 | ### Plugin adoption
39 |
40 | In order to include the plugin in the Strimzi images, we would need it to be available in Maven repositories.
41 | This proposal suggests forking the plugin under the Strimzi organization and continuing its development.
42 | The new repository should be named `kafka-quotas-plugin`.
43 | We can set up an Azure Pipelines build for it and push it to Maven repositories.
44 |
45 | We will add the plugin to our Kafka container images via the third-party libraries mechanism.
46 | At this time, we should not add any integration into the Strimzi Kafka CRD.
47 | The plugin and its quotas can be configured through `.spec.kafka.config`.
48 | But we should document it and include it in our system tests.
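
For illustration, enabling the plugin and its quotas through `.spec.kafka.config` might look like the following sketch (the property names below are assumptions based on the current plugin and would be settled when the plugin is forked):

```yaml
spec:
  kafka:
    config:
      # assumed callback class name in the forked plugin
      client.quota.callback.class: io.strimzi.kafka.quotas.StaticQuotaCallback
      # per-broker produce / fetch quotas in bytes per second (assumed property names)
      client.quota.callback.static.produce: 1000000
      client.quota.callback.static.fetch: 1000000
      # soft and hard storage limits in bytes (assumed property names)
      client.quota.callback.static.storage.soft: 400000000000
      client.quota.callback.static.storage.hard: 500000000000
```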
49 |
50 | We should also work on additional features.
51 | For example:
52 |
53 | * Support for JBOD storage
54 | * Support for per-listener configuration
55 |
56 | ## Risks
57 |
58 | The quota plugin will not be enabled by default, so there is minimal risk of it causing any problems.
59 |
60 | ## Affected / not affected projects
61 |
62 | This proposal affects only the plugin and the operators repository where it will be added to the Apache Kafka images.
63 |
64 | ## Compatibility
65 |
66 | The quota plugin will not be enabled by default, so there should be no impact on backwards compatibility.
67 |
--------------------------------------------------------------------------------
/025-control-plain-listener.md:
--------------------------------------------------------------------------------
1 | # Control Plane Listener
2 |
3 | This proposal suggests using a separate control plane listener and describes the plan for how it should be introduced.
4 |
5 | ## Current situation
6 |
7 | Strimzi currently has one internal listener.
8 | It is used by the Kafka brokers for data replication but also for coordination between the controller and the other brokers.
9 | And in addition to that, it is also used by the different Strimzi components:
10 | * Operators
11 | * Kafka Exporter
12 | * Cruise Control
13 |
14 | ## Motivation
15 |
16 | Kafka has supported separate listeners for data replication and coordination for some time.
17 | It was implemented as part of [KIP-291](https://cwiki.apache.org/confluence/display/KAFKA/KIP-291%3A+Separating+controller+connections+and+requests+from+the+data+plane).
18 | The main reason for separate listeners is that the very data-intensive replication traffic does not increase the latency of the coordination traffic.
19 | Strimzi should make use of it and have separate listeners for the different tasks.
20 |
21 | ## Proposal
22 |
23 | Introducing the separate control plane listener has an impact on backwards compatibility and on upgrade / downgrade procedures.
24 | During the upgrade or downgrade, different brokers which are part of the same cluster would have a different configuration.
25 | Some of them will try to coordinate using the dedicated control plane listener while others will try to coordinate using the replication listener.
26 | Because of this, the different members of the cluster will not be able to communicate and the cluster will not be available to clients during the upgrade or downgrade.
27 | This can be easily handled while upgrading.
28 | But it is not possible to handle it when downgrading as the version the user would be downgrading from will not have a chance to revert the control plane listener changes.
29 |
30 | To mitigate this, this proposal suggests using [Feature Gates](https://github.com/strimzi/proposals/blob/main/022-feature-gates.md) to introduce the control plane listener over multiple releases with minimal disruption for the users.
31 | Strimzi 0.23 will add a new internal listener on port 9090.
32 | This listener will be enabled by default and configured in the same way as the replication listener.
33 | But it will not be configured as the control plane listener.
34 |
35 | A new feature gate `ControlPlaneListener` will be added and disabled by default.
36 | When enabled, this feature gate will configure the Kafka brokers to use the new listener for the control plane communication.
37 | Replication, Strimzi operators, and other components such as Cruise Control or Kafka Exporter will keep using the existing replication listener.
38 |
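For illustration, enabling the feature gate during the early phases would use the `STRIMZI_FEATURE_GATES` environment variable on the Cluster Operator, following the mechanism from the feature gates proposal; a minimal sketch of the operator `Deployment` snippet:

```yaml
# Cluster Operator Deployment (snippet)
env:
  - name: STRIMZI_FEATURE_GATES
    value: "+ControlPlaneListener"
```
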
39 | Since it will be disabled by default at first, it will have no impact on backwards compatibility:
40 | * Upgrades from earlier versions will add the new listener.
41 | But it will not be used for anything yet and will not break the Kafka cluster.
42 | * Downgrades will remove the listener, but that will not cause any issues because it will not be used for anything.
43 | * Users who would want to try it or use it already during the early releases will be able to upgrade to the new Strimzi version supporting it and enable the feature gate.
44 | That will configure the Kafka broker to use the new listener.
45 | They will be also able to disable it again later.
46 | In case they would need to do a downgrade to a Strimzi version not supporting the `ControlPlaneListener` feature gate, they would need to disable it first.
47 |
48 | Later - after several Strimzi releases - the `ControlPlaneListener` feature gate will be changed to enabled by default.
49 | By that time, most users upgrading from the Strimzi versions which already support this feature gate will already have the new listener on port 9090 enabled.
50 | So the upgrade will proceed without any complications.
51 | Similarly, users downgrading to Strimzi versions already supporting the `ControlPlaneListener` feature gate will be able to downgrade without any problems.
52 | Only users upgrading from / downgrading to Strimzi versions not supporting the `ControlPlaneListener` feature gate will need to manually disable it first to allow the upgrade or downgrade to happen without any unavailability of the Kafka cluster.
53 |
54 | Finally, after more Strimzi releases, the `ControlPlaneListener` feature gate will move to the GA phase.
55 | The control plane listener will be enabled by default and the feature gate will be removed.
56 | It will not be possible to disable it anymore.
57 | Upgrades from / downgrades to Strimzi versions not supporting the `ControlPlaneListener` feature gate will not be possible anymore.
58 |
59 | The following table shows in which Strimzi versions the state of the feature gate is expected to change.
60 | This plan is subject to change in case any problems appear during the different phases.
61 |
62 | | Phase | Strimzi versions | Default state |
63 | |:------|:-----------------------|:-------------------------------------------------------|
64 | | Alpha | 0.23, 0.24, 0.25, 0.26 | Disabled by default |
65 | | Beta | 0.27, 0.28, 0.29, 0.30 | Enabled by default |
66 | | GA | 0.31 and newer | Enabled by default (without possibility to disable it) |
67 |
68 | _(The actual version numbers are subject to change)_
69 |
70 | Strimzi operators and other components (Cruise Control, Kafka Exporter) will keep using the replication listener on port 9091 for communication with the Kafka brokers.
71 |
72 | ## Compatibility
73 |
74 | The introduction of this feature is designed to minimize the compatibility impacts.
75 |
76 | ## Affected components
77 |
78 | Only the Cluster Operator and Kafka brokers are impacted by this proposal.
79 | Other operators or other operands are not affected.
80 |
81 | ## Rejected alternatives
82 |
83 | Introducing this feature without the possibility to downgrade back to previous Strimzi versions was considered but rejected.
--------------------------------------------------------------------------------
/026-service-account-patching.md:
--------------------------------------------------------------------------------
1 | # Service Account patching
2 |
3 | This proposal suggests introducing proper reconciliation of service accounts and handling them in the same way as other Kubernetes resources.
4 |
5 | ## Current situation
6 |
7 | Strimzi currently does not reconcile service accounts it creates.
8 | In the service account reconciliation loop, only creation and deletion actually do something.
9 | Patching just returns success without doing anything.
10 | This was originally done because patching removed the attached secret, causing a new authentication token to be created on every reconciliation.
11 |
12 | ## Motivation
13 |
14 | For many resources we create, we offer customization using the template mechanism.
15 | Users can use the `template` sections in the different custom resources to customize some parts of the Kubernetes resources.
16 | In some cases, it allows customizing only labels and annotations.
17 | In others, there are more things to configure.
18 | One of the advantages of this model is that users can declaratively configure things in the custom resources.
19 |
20 | However, there is currently no template for service accounts.
21 | And even if we add the template, it would work only at creation time because we do not patch service accounts right now.
22 | So that would create a weird situation where the template for service accounts behaves differently from other resources.
23 |
24 | At the same time, it is in many cases desired to have the labels and annotations configurable.
25 | Some platforms - such as AWS - link their internal identities to the service accounts based on annotations.
26 | For example, the annotation `eks.amazonaws.com/role-arn: arn:aws:iam::<ACCOUNT_ID>:role/<ROLE_NAME>` tells AWS that pods with this service account should have the role specified in the annotation (for more info, see [AWS documentation](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts-technical-overview.html)).
27 | This is especially useful for something like Kafka Connect with connectors interacting with other AWS services.
28 | The connectors can use the AWS IAM role assumed by the annotation to authenticate against AWS services.
29 |
30 | ## Proposal
31 |
32 | This proposal suggests several changes to how the Strimzi Cluster Operator handles the service accounts.
33 |
34 | 1) A new `serviceAccount` field will be added to the `template` sections of the Strimzi CRDs which result in any service accounts being created (`Kafka`, `KafkaConnect`, `KafkaConnectS2I`, `KafkaMirrorMaker2`, `KafkaMirrorMaker` and `KafkaBridge`).
35 | It will allow customizing the labels and annotations of the service accounts in a declarative way (see the sketch after this list).
36 | 2) The ServiceAccountOperator class in the `operator-common` module will be updated to patch the service accounts during reconciliation.
37 | To avoid issues with the tokens being recreated on every reconciliation, it will always copy the name of the token secret before patching (similarly as done for node ports in services etc.).
38 | That will ensure that the service accounts will not be disrupted by the patching.
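
As an illustration, the new template section might look like the following sketch (using `KafkaConnect` and the AWS annotation from the motivation; the exact nesting follows the existing metadata templates and the ARN values are placeholders):

```yaml
apiVersion: kafka.strimzi.io/v1beta2
kind: KafkaConnect
metadata:
  name: my-connect
spec:
  # ...
  template:
    serviceAccount:
      metadata:
        annotations:
          eks.amazonaws.com/role-arn: "arn:aws:iam::<ACCOUNT_ID>:role/<ROLE_NAME>"
```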
39 |
40 | Starting to patch the service accounts can cause issues for existing users.
41 | Since we do not patch them today, a lot of users simply annotate them manually after they are created, or create the service accounts with the desired labels and annotations before creating the Strimzi resources.
42 | If we suddenly enable patching of the service accounts, it might remove the labels and annotations for these users and cause problems for their running applications.
43 |
44 | To mitigate this, this proposal suggests using [Feature Gates](https://github.com/strimzi/proposals/blob/main/022-feature-gates.md) to introduce the patching of service accounts over multiple releases with minimal disruption for the users.
45 | A new feature gate `ServiceAccountPatching` will be added and disabled by default at first.
46 | When disabled, the operator will treat the service accounts as it does today and not patch them.
47 | In that case, the service account template will be used only at creation time.
48 | When enabled, the operator will start patching the service accounts in every reconciliation and any changes to the service account templates will be applied.
49 |
50 | The `ServiceAccountPatching` feature gate will mature over multiple releases as described in the proposed plan below.
51 | Once it reaches GA, the feature gate will be removed and the service account patching will be enabled by default.
52 | This will bring the handling of service accounts in sync with how other Kubernetes resources are handled.
53 |
54 | | Phase | Strimzi versions | Default state |
55 | |:------|:-----------------------|:-------------------------------------------------------|
56 | | Alpha | 0.24, 0.25, 0.26 | Disabled by default |
57 | | Beta | 0.27, 0.28, 0.29 | Enabled by default |
58 | | GA | 0.30 and newer | Enabled by default (without possibility to disable it) |
59 |
60 | _(The actual version numbers are subject to change)_
61 |
62 | ## Compatibility
63 |
64 | The introduction of this feature is designed to minimize the compatibility impacts.
65 |
66 | ## Affected components
67 |
68 | Only the Cluster Operator is impacted by this proposal.
69 |
70 | ## Rejected alternatives
71 |
72 | Enabling the patching of service accounts immediately without the feature gate was considered.
73 | But it was rejected because of the possible negative impact on users.
74 |
--------------------------------------------------------------------------------
/027-kubernetes-config-provider.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Configuration Provider for Apache Kafka
2 |
3 | This proposal suggests to create a configuration provider for reading data from Kubernetes Secrets and Config Maps.
4 |
5 | ## Current situation
6 |
7 | Apache Kafka supports pluggable configuration providers for loading configuration data from external sources.
8 | They can be used in the configuration of the different components.
9 | By default, Kafka provides two configuration providers:
10 |
11 | * `FileConfigProvider` for reading configuration records, as key-value pairs, from a single properties file
12 | * `DirectoryConfigProvider` for reading one or more configuration files from a specified directory (files are read from start to finish)
13 |
14 | In addition, users can implement their own configuration providers.
15 |
16 | ## Motivation
17 |
18 | Currently, Strimzi uses Kubernetes Secrets to store different types of information.
19 | For example, cluster or user certificates or user passwords.
20 | Kafka clients (or other components) running in the same namespace can mount the secrets as volumes or environment variables and use them.
21 | But this does not work for applications running in other namespaces or outside the Kubernetes cluster.
22 | They have to either copy the secrets into their namespace or extract the files to use them.
23 | Users also need to make sure that their data are kept in sync and update them when the original secrets change.
24 |
25 | Having a configuration provider which can extract these data directly from the Kubernetes API would eliminate these issues.
26 | It would allow users to load the configuration data from the Kubernetes API, even from other namespaces or outside of the Kubernetes cluster.
27 | Users would just need to use the configuration provider from their properties and have the data pulled from Kubernetes directly.
28 | Loading the configuration data directly from the Kubernetes API will also solve the issues with keeping the data up-to-date, since they will be loaded directly from the source.
29 |
30 | ## Proposal
31 |
32 | We should create a new configuration provider which would load data from Kubernetes Secrets and Config Maps.
33 | Secrets and Config Maps are both key-value stores.
34 | The configuration provider will get the Secret or Config Map and extract the desired keys from them.
35 |
36 | It will use the Fabric8 Kubernetes client to communicate with the Kubernetes API.
37 | There will be no special configuration for connecting to the Kubernetes cluster.
38 | It will just use the default auto-detection of Kubernetes credentials from `kubeconfig` file, mounted tokens etc.
39 | In addition, users can use the Fabric8 environment variables or Java system properties to configure the client.
40 |
41 | The configuration provider only reads the requested Config Map or Secret.
42 | The only RBAC right it needs is `get` access on the given resources.
43 | For example:
44 |
45 | ```yaml
46 | - apiGroups: [""]
47 | resources: ["secrets"]
48 | resourceNames: ["my-cluster-cluster-ca-cert"]
49 | verbs: ["get"]
50 | ```
51 |
52 | Thanks to that, users should be able to give the configuration provider very restricted access to just the selected resources.
53 |
54 | To use the configuration provider, the user will need to configure their client like this:
55 |
56 | ```properties
57 | config.providers=secrets
58 | config.providers.secrets.class=io.strimzi.kafka.KubernetesSecretConfigProvider
59 | security.protocol=SSL
60 | ssl.keystore.type=PEM
61 | ssl.keystore.certificate.chain=${secrets:myproject/my-user:user.crt}
62 | ssl.keystore.key=${secrets:myproject/my-user:user.key}
63 | ssl.truststore.type=PEM
64 | ssl.truststore.certificates=${secrets:myproject/my-cluster-cluster-ca-cert:ca.crt}
65 | ```
66 |
67 | The configuration provider should use a separate GitHub repository under the Strimzi GitHub organization.
68 | The repository should be named `kafka-kubernetes-config-provider`.
69 | It should be published to Maven Central as `io.strimzi:kafka-kubernetes-config-provider`.
70 |
71 | ## Compatibility
72 |
73 | The configuration provider should be compatible with all recent Apache Kafka versions.
74 |
75 | ## Affected components
76 |
77 | This is a separate component which should make it easier for users to use Strimzi based Apache Kafka clusters from their applications.
78 | But it has currently no direct impact on the other resources.
79 |
80 | ## Rejected alternatives
81 |
82 | No other alternatives were considered.
83 |
--------------------------------------------------------------------------------
/028-network-policy-generation-environment-variable.md:
--------------------------------------------------------------------------------
1 | # Network Policy Generation Environment Variable
2 |
3 | The feature would consist of a new Environment Variable which can be used to disable the generation of Network Policies by Strimzi. This will be useful when a user wishes to set up a custom Network Policy set.
4 |
5 | ## Current situation
6 |
7 | Currently, Strimzi generates Network Policies which allow all pods in the deployment to connect to Kafka, Zookeeper and any other service that it has generated.
8 |
9 | There is no way to stop this generation.
10 |
11 | It is possible, in the specific case of listeners, to further restrict the Ingress using NetworkPolicyPeers; however, this does not extend to all Network Policy generation, for example metrics.
12 | It is also not possible to set up Egress Network Policies.
13 |
14 | ## Motivation
15 |
16 | The current Network Policies that are generated are great for getting Strimzi up and running in a deployment where a deny-all network policy is in place.
17 |
18 | However, this does not allow users of Strimzi to implement a custom Network Policy set, due to Strimzi not providing the full feature set of Kubernetes Network Policies.
19 |
20 | By adding an Environment Variable, it allows users to disable this generation, and instead write fully custom Network Policies.
21 |
22 | When disabled, the responsibility of setting up secure network policies lies with the user, and in environments where a deny-all network policy exists, Strimzi will not operate until these are correctly set up.
23 | The Network Policies that exist and would have to be created will be documented.
24 |
25 | This setup is valid in 2 situations:
26 |
27 | * A deployment with no deny-all Network Policy, where all pods can already communicate with each other.
28 | * A deployment where the user wishes to write their own custom Network Policies.
29 |   * A user may have strict guidelines on the network policy requirements which differ from Strimzi's generated policies.
30 |
31 | ## Proposal
32 |
33 | The proposal would be to introduce an Environment Variable which would provide the ability to turn off the generation of Network Policies by Strimzi.
34 |
35 | As the functionality to generate Network Policies already exists, a sensible name for this Environment Variable would be `STRIMZI_NETWORK_POLICY_GENERATION`, which defaults to `true`.
36 | If users require custom Network Policies, they can disable this by setting the value to `false`.
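
For illustration, disabling the generation would look like this on the Cluster Operator `Deployment` (a minimal sketch of the relevant `env` entry):

```yaml
# strimzi-cluster-operator Deployment (snippet)
env:
  - name: STRIMZI_NETWORK_POLICY_GENERATION
    value: "false"
```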
37 |
38 | When this Environment Variable is set to `false`, all generation of Network Policies by the Strimzi operator will be disabled.
39 |
40 | This extends to:
41 |
42 | * CruiseControl
43 | * KafkaCluster
44 | * KafkaConnectCluster
45 | * ZookeeperCluster
46 | * KafkaMirrorMaker2
47 |
48 | When enabled, as is the default, the Network Policy Generation will act as it does currently, creating Network Policies for the Strimzi services.
49 |
50 | This will leave Strimzi in a position to support all Network Policy setups. This means the native support by Strimzi will be:
51 | "By default, generation of Network Policies is enabled, limiting on port and, in the case of listeners, on Ingress NetworkPolicyPeers. All other Network Policy setups can be achieved by disabling the automatic generation and adding Network Policies manually."
52 |
53 | ## Affected/not affected projects
54 |
55 | This only impacts the [strimzi-kafka-operator](https://github.com/strimzi/strimzi-kafka-operator) project, which currently generates these network policies.
56 |
57 | ## Compatibility
58 |
59 | Setting this Environment Variable to `true` by default maintains existing compatibility.
60 |
61 | ## Rejected alternatives
62 |
63 | ### Embedding full Network Policy syntax into Strimzi
64 |
65 | The alternative to adding an Environment Variable was to attempt to support the full functionality of Network Policies within Strimzi.
66 | The original request for this feature was to allow stricter rules for the metrics ports; however, further requests also exist for adding Egress support in these Network Policies.
67 |
68 | However, this was rejected because this introduces a lot of customisation that will not be used by the majority of users.
69 | A user expecting custom network policies is most likely willing to write these themselves, rather than have Strimzi recreate the same syntax already present in Network Policies.
70 |
71 | ### Individual control of network policy generation for Strimzi components
72 |
73 | A suggestion was to introduce a Feature Flag for each component of the product. This would indeed fulfil the requirements.
74 |
75 | However, this was rejected because the use of custom rules is most likely to come from a rule on the user's side that applies across their entire platform.
76 | For example: "All Ingress Network Policies must be restricted by `ports` and `from`."
77 |
78 | Conforming to whatever rule the user has in place will likely mean they are required to turn off all generation and write their own rules.
79 | So in the majority of cases this turns into an all-or-nothing switch, reducing the value of individual switches.
80 |
--------------------------------------------------------------------------------
/030-env-var-config-provider.md:
--------------------------------------------------------------------------------
1 | # EnvVar Configuration Provider for Apache Kafka
2 |
3 | This proposal suggests to create a configuration provider for reading data from environment variables.
4 |
5 | ## Current situation
6 |
7 | Apache Kafka supports pluggable configuration providers for loading configuration data from external sources.
8 | They can be used in the configuration of the different components.
9 | By default, Kafka provides two configuration providers:
10 |
11 | * `FileConfigProvider` for reading configuration records, as key-value pairs, from a single properties file
12 | * `DirectoryConfigProvider` for reading one or more configuration files from a specified directory (files are read from start to finish)
13 |
14 | In addition, users can implement their own configuration providers.
15 |
16 | Strimzi already has one custom configuration provider.
17 | The [Kubernetes Configuration Provider](https://github.com/strimzi/kafka-kubernetes-config-provider) can be used to get data from Secrets or Config Maps directly from the Kubernetes API using a Kubernetes client.
18 | This was proposed in [Strimzi Proposal #027](https://github.com/strimzi/proposals/blob/main/027-kubernetes-config-provider.md).
19 |
20 | ## Motivation
21 |
22 | One of the common ways to configure applications running on Kubernetes is using environment variables.
23 | Environment variables can be set in the Pods (or Deployments, StatefulSets etc.).
24 | The values for the environment variables can be either set directly or mapped from other resources.
25 | For example from Secrets, Config Maps or from the Downward API.
26 | Using environment variables for application configuration also corresponds to the [12 Factor principles](https://12factor.net/config).
27 |
28 | However, most Kafka components are configured from a properties file and not from environment variables.
29 | And Kafka does not support any easy way to set individual properties to values from environment variables directly inside the properties file.
30 | That means that in order to use the environment variables, you have to generate the properties file and insert the values from the environment variables into it.
31 | This is something we do in our own bash scripts in our container images as well.
32 |
33 | Having a configuration provider which would load the environment variables might help us to simplify our bash scripts.
34 | The whole configuration file might, for example, be generated by the operator.
35 | Instead of the references it contains today, which are replaced with environment variable values in the bash scripts, it could call the configuration provider directly, so no replacements would be needed later in bash.
36 | Doing more work in Java instead of in bash would make the code cleaner and testing easier.
37 | It would also simplify the _API_ between the operator and the container images.
38 |
39 | The configuration provider might of course also be useful to Strimzi users or to Apache Kafka users in general, not just to us.
40 | Users might for example use it to load values from environment variables in connector configurations or in client configurations.
41 |
42 | ## Proposal
43 |
44 | We should create a new configuration provider which would load data from environment variables.
45 | To use the configuration provider, the user will need to configure their client like this:
46 |
47 | ```properties
48 | config.providers=env
49 | config.providers.env.class=io.strimzi.kafka.EnvVarConfigProvider
50 | option1=${env::FIRST_ENV_VAR}
51 | option2=${env::SECOND_ENV_VAR}
52 | ```
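
On Kubernetes, such environment variables would typically be injected into the Pod from other resources, for example from a Secret (the Secret name and key below are illustrative):

```yaml
env:
  - name: FIRST_ENV_VAR
    valueFrom:
      secretKeyRef:
        name: my-secret   # illustrative Secret name
        key: option1      # illustrative key
```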
53 |
54 | The configuration provider should use a separate GitHub repository under the Strimzi GitHub organization.
55 | The repository should be named `kafka-env-var-config-provider`.
56 | It should be published to Maven Central as `io.strimzi:kafka-env-var-config-provider`.
57 | It should also be bundled with all our images (Kafka, Kafka Bridge, etc.).
58 |
59 | ## Compatibility
60 |
61 | The configuration provider should be compatible with all recent Apache Kafka versions.
62 |
63 | ## Affected components
64 |
65 | This is a separate component which should make it easier for users to use Strimzi based Apache Kafka clusters from their applications.
66 | But it has currently no direct impact on the other resources.
67 |
68 | ## Rejected alternatives
69 |
70 | No other alternatives were considered.
71 |
--------------------------------------------------------------------------------
/034-deprecate-and-remove-mirror-maker-2-extensions.md:
--------------------------------------------------------------------------------
1 | # Deprecate and remove MirrorMaker 2 extensions
2 |
3 | ## Current situation
4 |
5 | When Mirror Maker 2 was first introduced, it was missing a policy for replicating topics between two Kafka clusters without changing their names.
6 | We created the [_Mirror Maker 2 Extensions_ project](https://github.com/strimzi/mirror-maker-2-extensions) which contains the `IdentityReplicationPolicy` policy.
7 | This policy makes sure that topics are not renamed while being mirrored.
8 | The `IdentityReplicationPolicy` is the only _extension_ we provide as part of this project.
9 |
10 | In Kafka 3.0.0, Mirror Maker 2 introduces its own `IdentityReplicationPolicy`.
11 | This was done in [KAFKA-9726](https://issues.apache.org/jira/browse/KAFKA-9726).
12 | This policy provides the same features as Strimzi's own `IdentityReplicationPolicy`.
13 | It does not seem to make sense to maintain our own policy in parallel to Apache Kafka.
14 |
15 | ## Proposal
16 |
17 | After dropping support for Kafka versions older than 3.0.0 (expected to happen in Strimzi 0.28.0), we should deprecate our own policy and migrate users to Kafka's identity replication policy.
18 |
19 | The following steps should be taken as part of the 0.28.0 release:
20 | * Update our documentation to use Kafka's `IdentityReplicationPolicy`.
21 | * Update our examples to use Kafka's `IdentityReplicationPolicy`.
22 | * Update the `CHANGELOG.md` file to indicate the deprecation.
23 | * Release a new version of the MM2 Extensions, which would extend Kafka's `IdentityReplicationPolicy` instead of implementing our own logic.
24 | The change to the extensions project should ensure exactly the same behavior from both policies.
25 | It would also make sure that any users still using `io.strimzi.kafka.connect.mirror.IdentityReplicationPolicy` do not have any problems.
26 | * After the release, clearly deprecate and archive the `mirror-maker-2-extensions` repository.
27 | (If we ever need any other MM2 extension in the future, we can always unarchive it)
28 |
29 | Even in Strimzi 0.28 and newer releases, we will still include the `mirror-maker-2-extensions` JARs to ensure compatibility.
30 | The JAR will be removed only in Strimzi 0.32.0 (i.e. after 4 releases) or Strimzi 1.0.0 (whichever comes first).
31 | From that version on, users will need to use the Apache Kafka policy in their configuration as `org.apache.kafka.connect.mirror.IdentityReplicationPolicy`.
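
For example, in a `KafkaMirrorMaker2` custom resource, the replication policy is set through the connector configuration; after the removal, it would reference the Kafka class (a sketch with illustrative cluster aliases):

```yaml
spec:
  mirrors:
    - sourceCluster: my-source   # illustrative alias
      targetCluster: my-target   # illustrative alias
      sourceConnector:
        config:
          replication.policy.class: org.apache.kafka.connect.mirror.IdentityReplicationPolicy
```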
32 |
33 | Any users using the Mirror Maker 2 Extensions with older versions of Apache Kafka outside of Strimzi will still be able to get the previously released versions from our Maven repositories.
34 | Also, anyone building an older version of Strimzi will be able to use the older versions and will not be impacted by the archiving of the GitHub repository.
35 |
36 | ## Affected projects
37 |
38 | This proposal impacts the Mirror Maker 2 Extensions and Strimzi Kafka Operators projects.
39 |
40 | ## Backwards compatibility
41 |
42 | The impact on backwards compatibility is described in the proposal.
43 |
44 | ## Rejected alternatives
45 |
46 | Keep maintaining our own `IdentityReplicationPolicy` in parallel with Kafka.
47 |
--------------------------------------------------------------------------------
/038-optimization-proposal-autoapproval.md:
--------------------------------------------------------------------------------
1 | # Auto-approval mechanism for optimization proposals
2 |
3 | This proposal is about allowing the user to create a `KafkaRebalance` custom resource for getting an optimization proposal and running a rebalance, but without the need for the manual approval via the `strimzi.io/rebalance=approve` annotation.
4 | This means that the user can simply create the `KafkaRebalance` custom resource and the corresponding generated optimization proposal will be approved automatically.
5 |
6 | ## Current situation
7 |
8 | Currently, when the users want to run a cluster rebalance, they have to create a `KafkaRebalance` custom resource in order to generate an optimization proposal first.
9 | After getting the proposal, the only way to start the rebalancing operation is by approving it by annotating the custom resource with the `strimzi.io/rebalance=approve` annotation.
10 | This requires manual intervention from the user and is a two-step process.
11 |
12 | ## Motivation
13 |
14 | In some cases, the manual approval process of the rebalance proposal is not necessary.
15 | Automatically approving an optimization proposal can save time from an operational point of view.
16 | It enables more automation where just creating a `KafkaRebalance` custom resource can go straight to the cluster rebalance without any additional manual interaction.
17 |
18 | ## Proposal
19 |
20 | The `KafkaRebalance` custom resource can be annotated with a new `strimzi.io/rebalance-auto-approval` annotation for this purpose.
21 |
22 | ### Auto-approval
23 |
24 | The auto-approval can be enabled simply by annotating the `KafkaRebalance` custom resource with the `strimzi.io/rebalance-auto-approval=true` annotation.
25 |
26 | ```yaml
27 | apiVersion: kafka.strimzi.io/v1beta2
28 | kind: KafkaRebalance
29 | metadata:
30 | name: my-rebalance
31 | labels:
32 | strimzi.io/cluster: my-cluster
33 | annotations:
34 |     strimzi.io/rebalance-auto-approval: "true"
35 | spec:
36 | mode: # any mode
37 | # ...
38 | ```
39 |
40 | The reason for not having a simple boolean `spec.autoApproval` field is to leave space for future extension, such as adding criteria or rules that have to be matched for auto-approving the optimization proposal.
41 | It allows the community to take more time to understand whether such support is really useful and which direction to go.
42 |
43 | ### No auto-approval (default)
44 |
45 | If the `strimzi.io/rebalance-auto-approval` annotation is not specified at all, the default behavior will be the current one: the user has to approve the proposal manually by annotating the `KafkaRebalance` custom resource with the `strimzi.io/rebalance=approve` annotation.
46 | Of course, the annotation `strimzi.io/rebalance-auto-approval` can be set to `false` to get the same result.
47 |
48 | ### Flow
49 |
50 | As described before, the user interaction flow assumes that the `strimzi.io/rebalance-auto-approval` annotation is specified in the `KafkaRebalance` custom resource.
51 |
52 | 1. The user creates a `KafkaRebalance` custom resource.
53 | 2. The cluster operator asks Cruise Control to generate an optimization proposal via the REST API.
54 | 3. When the optimization proposal is ready, the cluster operator checks if the user has specified the `strimzi.io/rebalance-auto-approval` annotation.
55 | * If the `strimzi.io/rebalance-auto-approval` annotation is not specified at all or it is set to `false`, the user has to approve the proposal manually as usual.
56 |    * If the `strimzi.io/rebalance-auto-approval` annotation is specified and set to `true`, the cluster operator approves the proposal automatically by annotating the `KafkaRebalance` custom resource with the `strimzi.io/rebalance=approve` annotation.
57 |
58 | ## Affected/not affected projects
59 |
60 | This proposal impacts the Strimzi Cluster Operator only and mostly the `KafkaRebalanceAssemblyOperator`.
61 |
62 | ## Compatibility
63 |
64 | The manual approval will be still in place as it is today.
65 | As described before, if the `strimzi.io/rebalance-auto-approval` annotation is not specified or set to `false` in the `KafkaRebalance` custom resource, the default behavior will be the current one, so the need for manual approval by the user.
66 |
67 | ## Rejected alternatives
68 |
69 | ### Using `spec.autoApproval` boolean field
70 |
71 | Having a simple boolean `spec.autoApproval` field was rejected because we could need more extensibility in the future, for example to add configuration such as criteria or rules to be met for auto-approving the proposal.
72 | Using a boolean would have needed an additional field for that, such as `spec.autoApprovalRules`.
73 | It is anyway possible that, if at some point the community agrees that criteria and rules won't be supported, the proposed annotation will be promoted to such a boolean field.
74 |
75 | ### Using `spec.autoApproval` object field
76 |
77 | Having a more complex `spec.autoApproval` field was rejected because there is currently no clear plan for how to support criteria or rules for the auto-approval process.
78 | It is actually not clear if we want them and what the right shape is.
79 | Going through an annotation for now allows the feature to be used while leaving more time to think about possible criteria and rules support.
80 | Even in this case, it is possible that the currently proposed annotation will be promoted to such a more complex field to allow more configuration related to the auto-approval process.
81 |
--------------------------------------------------------------------------------
/039-reduce-test-clients-images.md:
--------------------------------------------------------------------------------
1 | # Reduce Strimzi test-client's images
2 |
3 | This proposal suggests reducing Strimzi test-client's images.
4 |
5 | ## Current situation
6 |
7 | The test-clients repository contains 2 HTTP clients - producer & consumer - and 4 Kafka clients - producer, consumer, admin & streams.
8 | For each client, we are building a separate image.
9 | The Kafka clients are then built for each supported Kafka version, as specified in the [`kafka.version`](https://github.com/strimzi/test-clients/blob/main/docker-images/kafka.version) file.
10 | That means we can end up with 10 images for a release when we are supporting 2 Kafka versions.
11 | The image base is the same for all the clients; the only difference is the `jar` used.
12 |
13 | ## Motivation
14 |
15 | During releases of test-clients, we found a few difficulties with the current implementation.
16 | The dependencies are mostly the same for all the clients.
17 | After we added support for new architectures (`ppc64le`, `s390x`, `arm64`), the whole build process became chaotic.
18 | At the same time, in our `systemtest` module, we need to specify an image for each client.
19 |
20 | ## Proposal
21 |
22 | The clients' code bases will remain the same as they are.
23 | The image build process will be different.
24 | Both HTTP and Kafka clients will be inside one image.
25 |
26 | ### Image content
27 |
28 | * `/opt/test-clients` - base folder for test-clients
29 | * `/opt/test-clients/bin` - scripts for running each client
30 |   * e.g. `http_producer_run.sh`
31 |   * the scripts will contain the correct classpath, Java options and the path to the main class
32 | * `/opt/test-clients/lib` - all needed dependencies for running the clients
33 |
34 | The approach will be similar to the `operator`'s image inside Strimzi.
35 |
36 | ### Image name
37 |
38 | Current pattern:
39 |
40 | * `quay.io/strimzi-test-clients/test-client-kafka-{client-type}:{version}-kafka-{kafka-version}`
41 | * `quay.io/strimzi-test-clients/test-client-http-{client-type}:{version}`
42 |
43 | will be changed to `quay.io/strimzi-test-clients/test-clients:{version}-kafka-{kafka-version}`.
44 |
45 | ### `Systemtest` implementation and changes
46 |
47 | * Environment variables will be reduced
48 | * `TEST_PRODUCER_IMAGE`, `TEST_CONSUMER_IMAGE` and other environment variables will be removed
49 | * `TEST_CLIENT_IMAGE` will be added
50 | * Container configuration of each client will contain `args` section with appropriate `*_run.sh`
51 | ```yaml
52 | args:
53 | - /opt/test-clients/bin/http_producer_run.sh
54 | ```
55 |
56 | ## Advantages
57 |
58 | * One image instead of six (for one Kafka version)
59 | * Easier and clearer building system
60 | * Simple usage in STs
61 |
62 | ## Affected/not affected projects
63 |
64 | [Test-clients repository](https://github.com/strimzi/test-clients/) and `systemtest` module in operators repository.
65 |
66 | ## Rejected alternatives
67 |
68 | There are no rejected alternatives at the moment.
--------------------------------------------------------------------------------
/040-refactor-client-examples.md:
--------------------------------------------------------------------------------
1 | # Refactor KafkaConfig files in Strimzi Client Examples
2 |
3 | This proposal is about simplifying the `Kafka*Config` files of the Java modules of the [Strimzi Client Examples repo](https://github.com/strimzi/client-examples).
4 |
5 | ## Current situation
6 |
7 | The `Kafka*Config` files of the Java modules in the [Strimzi Client Examples repo](https://github.com/strimzi/client-examples) have become quite complex since they support the configuration of several specific Kafka client properties via environment variables.
8 | They contain a lot of the logic for configuring the client.
9 | As an example, you can have a look at the logic around setting Kafka’s `security.protocol` option which is spread across several places in the code.
10 |
11 | ## Motivation
12 |
13 | The main purpose of the Client Examples is to show users how to write a simple Kafka client based on the Kafka Consumer, Producer and Streams API.
14 | However, over time, we added a lot of complexity to the client code because it was being used in the System Tests at some point in time.
15 | So a user who wants to learn from the Client Examples has to dig through this code to understand how they are configured.
16 | This is currently not easy because of how the configuration is generated through the Java code.
17 | In addition to that, we found adding Kafka client configuration updates to the `Kafka*Config` files of the Java modules in the [Strimzi Client Examples repo](https://github.com/strimzi/client-examples) to be a little more verbose and messier than necessary.
18 | The current method requires creating a field, getter, setter, and hard coded String per new Kafka client config field.
19 | This method does not scale well and makes for a longer and messier class.
20 | So for any users who want to write their own Kafka client in Java, this is neither a good inspiration nor good code to copy.
21 |
22 | ## Proposal
23 |
24 | We could greatly reduce the complexity and size of the class if we standardize the naming scheme of environment variables used to configure the clients of the [Strimzi Client Examples repo](https://github.com/strimzi/client-examples).
25 |
26 | We could standardize the env vars used to configure the Kafka client properties in the following manner:
27 | ```
28 | KAFKA_<UPPER_CASE_PROPERTY_NAME_WITH_DOTS_REPLACED_BY_UNDERSCORES>
29 | ```
30 | For example:
31 | ```
32 | KAFKA_BOOTSTRAP_SERVERS for `bootstrap.servers` property
33 | KAFKA_GROUP_ID for `group.id` property
34 | …
35 | ```
36 | Then we could create a properties file by looping through the environment variables and translating them into a properties configuration like so:
37 | ```
38 | Properties prop = new Properties();
39 | Map<String, String> envVars = System.getenv();
40 | for (Map.Entry<String, String> entry : envVars.entrySet()) {
41 |     String key = convertEnvVarToPropertyKey(entry.getKey()); // e.g. KAFKA_GROUP_ID -> group.id
42 |     String value = entry.getValue();
43 |     prop.put(key, value);
44 | }
45 | ```
46 | This removes the need for fields, getters, setters, and hard coded Strings for specific Kafka client properties.
47 | Note that the configuration options which cannot be directly configured as Kafka configuration properties would be configured with environment variables starting with a `STRIMZI_` prefix.
48 | These would be handled directly in the Java code.
49 | This change would include configuration options for things like tracing initialization (interceptors can be set through `KAFKA_` variables, but the tracer would still need to be initialized).
50 | For example:
51 | ```
52 | STRIMZI_TOPIC for topic configuration
53 | STRIMZI_TRACING_SYSTEM for tracingSystem configuration
54 | ...
55 | ```
56 | However, other functionality that does not cover the basic example use-case - such as blocking the producer or the transactions support used for the Strimzi system tests in the past - would be removed.
57 | Thanks to these changes, the majority of the configuration would take place in the YAML deployment files, be easier to read for the users, and be easier to _translate_ to other clients.
58 |
59 | ## Affected projects
60 |
61 | The [Strimzi Client Examples repo](https://github.com/strimzi/client-examples)
62 |
63 | ## Rejected alternatives
64 |
65 | Passing the configuration as a single properties file was considered.
66 | It would make the configuration more readable as there will be no transformation to the property names as we would do with the environment variables.
67 | The properties file can be passed from Config Map as a volume or as an environment variable (either from the ConfigMap or have it defined in the YAML directly).
68 | However, this seems to be less _Kubernetes native_ than using the environment variables. So we rejected this alternative.
69 |
70 |
71 |
--------------------------------------------------------------------------------
/041-user-operator-configurable-exclusion-of-labels.md:
--------------------------------------------------------------------------------
1 | # User Operator: Configurable exclusion of labels
2 |
3 | This proposal is about adding the ability to filter the automatically assigned labels.
4 | When the User Operator reconciles a `KafkaUser`, it also creates an associated Secret, where it automatically adds the label
5 | `app.kubernetes.io/instance: <kafka-user-name>`.
6 | Such a label could serve many purposes, and different users have various requirements and expectations.
7 | Nevertheless, it could also lead to undesirable scenarios (e.g., [repeated deletion and re-creation of specific Secret](https://github.com/strimzi/strimzi-kafka-operator/issues/5690)).
8 | Therefore, we propose to make exclusion of labels configurable.
9 |
10 | ## Current situation
11 |
12 | Presently, when a user attempts to create a `KafkaUser` defined as follows:
13 | ```yaml
14 | apiVersion: kafka.strimzi.io/v1beta2
15 | kind: KafkaUser
16 | metadata:
17 | name: my-user
18 | labels:
19 | strimzi.io/cluster: my-cluster
20 | spec:
21 | authentication:
22 | type: tls
23 | ```
24 | The User Operator then creates an associated Kubernetes `Secret`.
25 | ```yaml
26 | kind: Secret
27 | metadata:
28 | labels:
29 | app.kubernetes.io/instance: my-user
30 | app.kubernetes.io/managed-by: strimzi-user-operator
31 | app.kubernetes.io/name: strimzi-user-operator
32 | app.kubernetes.io/part-of: strimzi-my-user
33 | strimzi.io/cluster: my-cluster
34 | strimzi.io/kind: KafkaUser
35 | name: my-user
36 | namespace: myproject
37 | ...
38 | ```
39 | We can see that the User Operator automatically adds the `app.kubernetes.io/instance: my-user` label.
40 | If we do not want the assigned label, the only way to get rid of it is to filter it out using regexes.
41 |
42 | ## Proposal
43 |
44 | This proposal suggests adding configurable exclusion of labels.
45 | We can implement this feature similarly to issue 4394 (implemented in [PR #4791](https://github.com/strimzi/strimzi-kafka-operator/pull/4791)).
46 | We can create an environment variable and inject the value through the `Kafka` custom resource.
47 | Specifically, in the `spec.entityOperator.template.userOperatorContainer.env`.
48 | For instance, we can have the following `Kafka` resource:
49 | ```yaml
50 | apiVersion: kafka.strimzi.io/v1beta2
51 | kind: Kafka
52 | metadata:
53 | name: my-cluster
54 | spec:
55 | ...
56 | entityOperator:
57 | template:
58 | userOperatorContainer:
59 | env:
60 |           - name: STRIMZI_LABELS_EXCLUSION_PATTERN
61 |             value: "^app.kubernetes.io/.*$"
62 | topicOperator: {}
63 | userOperator: {}
64 | ```
65 | Then, we would obtain a value from the environment variable and parse it in the `UserOperatorConfig` class.
66 | Based on the environment variable value: (a) when the variable is not specified, we do not exclude any label from the Secret;
67 | (b) when the variable is specified as a regex, we filter out and remove those labels which match the regex.
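
With the example pattern above (`^app.kubernetes.io/.*$`), the reconciled Secret from the current situation section would keep only the Strimzi labels:

```yaml
kind: Secret
metadata:
  labels:
    strimzi.io/cluster: my-cluster
    strimzi.io/kind: KafkaUser
  name: my-user
  namespace: myproject
```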
68 |
69 | ### More implementation details
70 |
71 | #### Parsing part
72 |
73 | Since we mentioned the `UserOperatorConfig` class, we will elaborate on the specific details in this section.
74 | We will obtain the environment variable inside the `fromMap` method used to construct the class.
75 | ```java
76 | public static UserOperatorConfig fromMap(Map<String, String> map) {
77 |     ...
78 |     // read the raw exclusion pattern from the environment variable map
79 |     String strimziLabelsExclusionPatternEnvVar = map.get(UserOperatorConfig.STRIMZI_LABELS_EXCLUSION_PATTERN);
80 | 
81 |     if (strimziLabelsExclusionPatternEnvVar != null) {
82 |         // note that we compile the regex only once here and thus eliminate repeated work inside KafkaUserModel
83 |         strimziLabelsExclusionPattern = Pattern.compile(strimziLabelsExclusionPatternEnvVar);
84 | }
85 | ...
86 | }
87 | ```
88 |
89 | #### Exclusion part
90 |
91 | The exclusion part will be placed in the `KafkaUserModel` class.
92 | Moreover, we need to pass the value of the environment variable to that class.
93 | We will obtain the environment variable value in the `UserOperatorConfig` class and then inject it in the `KafkaUserOperator` class, specifically in the following method:
94 | ```java
95 | protected Future<KafkaUserStatus> createOrUpdate(Reconciliation reconciliation, KafkaUser resource) {
96 | KafkaUserModel user;
97 | KafkaUserStatus userStatus = new KafkaUserStatus();
98 |
99 | try {
100 |         user = KafkaUserModel.fromCrd(resource, config.getSecretPrefix(), config.isAclsAdminApiSupported(), config.isKraftEnabled(),
101 |             config.getStrimziLabelsExclusionPattern()); // <-- this one we will inject into the KafkaUserModel class
102 | } catch (Exception e) {
103 | StatusUtils.setStatusConditionAndObservedGeneration(resource, userStatus, Future.failedFuture(e));
104 | return Future.failedFuture(new ReconciliationException(userStatus, e));
105 | }
106 | ...
107 | }
108 | ```
109 | Then, we can store the environment variable value as an instance attribute of `KafkaUserModel` and implement the pre-processing of labels in the
110 | following method:
111 | ```java
112 | protected Secret createSecret(Map<String, String> data) {
113 |     final Map<String, String> labels = Util.mergeLabelsOrAnnotations(this.labels.toMap(), templateSecretLabels);
114 |     // here, we have to do pre-processing (i.e., filtering) of labels by the exclusion pattern
115 |     // filter by the value of the instance attribute `this.strimziLabelsExclusionPattern`
116 | 
117 |     return new SecretBuilder()
118 | .withNewMetadata()
119 | .withName(getSecretName())
120 | .withNamespace(namespace)
121 | .withLabels(labels)
122 | .withAnnotations(Util.mergeLabelsOrAnnotations(null, templateSecretAnnotations))
123 | .withOwnerReferences(createOwnerReference())
124 | .endMetadata()
125 | .withType("Opaque")
126 | .withData(data)
127 | .build();
128 | }
129 | ```
130 |
131 | ## Compatibility
132 |
133 | This proposal does not change any of the existing CRDs or the Kubernetes secrets that are being created.
134 | To use this feature, the user only needs to modify either the User Operator Deployment (in the case of a standalone deployment) or the `Kafka` custom resource.
135 |
136 | ## Rejected alternatives
137 |
138 | 1. We considered the alternative of simply removing such labels entirely. However, this could lead to unexpected behaviour (i.e., breaking any users relying on them).
139 |
--------------------------------------------------------------------------------
/042-remove-bridge-amqp-support.md:
--------------------------------------------------------------------------------
1 | # Remove AMQP 1.0 support from the Strimzi bridge
2 |
3 | This proposal is about removing the current support for the AMQP 1.0 protocol from the Strimzi bridge, leaving just the HTTP support.
4 |
5 | ## Current situation
6 |
7 | Currently, the Strimzi bridge provides support for the AMQP 1.0 protocol in addition to HTTP.
8 | The AMQP 1.0 protocol support is provided by using the [Vert.x Proton](https://github.com/vert-x3/vertx-proton) component for handling the communication on the wire.
9 | The current implementation comes from a custom design in terms of the usage of AMQP 1.0 message properties, addresses and so on, not following any specific standard such as the newly available [Event Stream Extensions for AMQP Version 1.0](https://docs.oasis-open.org/amqp/event-streams/v1.0/csd01/event-streams-v1.0-csd01.html).
10 |
11 | ## Motivation
12 |
13 | The AMQP 1.0 protocol support does not seem to be used extensively by the Strimzi community.
14 | We have never seen users open GitHub issues or discussions about adding new features or fixing bugs in the AMQP 1.0 part.
15 | The same applies on the CNCF Slack #strimzi channel and the mailing lists.
16 | Even asking users about their AMQP 1.0 usage didn't get any answer.
17 | It seems that the Strimzi bridge is mostly used for its HTTP protocol support, which is where we see more engagement from the users.
18 | Currently the AMQP 1.0 support offers no benefit to users and comes with costs to the project:
19 |
20 | * costs in terms of a more complex bridge architecture, requiring abstraction over HTTP and AMQP protocols.
21 | * ongoing costs for testing.
22 | * an extra Vert.x dependency, the Vert.x Proton component.
23 |
24 | Removing it allows us to reduce the dependency surface which could be impacted by bugs and CVEs.
25 |
26 | ## Proposal
27 |
28 | This proposal suggests removing the AMQP 1.0 protocol support from the Strimzi bridge.
29 | It involves removing different parts of the codebase:
30 |
31 | * The specific AMQP 1.0 related part in the `io.strimzi.kafka.bridge.amqp` package.
32 | * The part which is used for tracking offsets for AMQP 1.0 in the `io.strimzi.kafka.bridge.tracker` package.
33 | * Simplifying or even removing the `SinkBridgeEndpoint` and `SourceBridgeEndpoint` classes that are currently used as a common layer across AMQP 1.0 and HTTP for interacting with the Apache Kafka cluster.
34 | * All the corresponding tests.
35 | * All the documentation about the AMQP 1.0 support design and usage (see `amqp` folder) and configuration for using the [Qpid Dispatch Router](https://qpid.apache.org/components/dispatch-router/index.html) with it (see `qdrouterd` folder).
36 |
37 | In the future, but without any actual roadmap or ETA for it, we could come back to have a support for AMQP 1.0 to Apache Kafka bridging with a separate component under the Strimzi organization and by implementing the [Event Stream Extensions for AMQP Version 1.0](https://docs.oasis-open.org/amqp/event-streams/v1.0/csd01/event-streams-v1.0-csd01.html) OASIS standard.
38 |
39 | ## Affected/not affected projects
40 |
41 | This proposal affects the Strimzi bridge project only.
42 | The Strimzi operator is not affected because currently it is not possible to enable the AMQP 1.0 support on the bridge through the `KafkaBridge` custom resource.
43 |
44 | ## Compatibility
45 |
46 | When the removal is completed, users won't be able to use the AMQP 1.0 protocol with future bridge releases.
47 | If they really need it, the only option is to stick with older bridge versions.
48 |
49 | ## Rejected alternatives
50 |
51 | No rejected alternatives to mention.
52 |
--------------------------------------------------------------------------------
/043-deprecate-and-remove-jmxtrans.md:
--------------------------------------------------------------------------------
1 | # Deprecate and remove JMX Trans
2 |
3 | ## Current situation
4 |
5 | [JMX Trans](https://github.com/jmxtrans/jmxtrans) is a tool which collects data from the JMX endpoints of Java applications and sends it to other applications and services.
6 | Strimzi integrates JMX Trans as part of the `Kafka` custom resource.
7 | You can configure it in the `.spec.jmxTrans` section:
8 |
9 | ```yaml
10 | apiVersion: kafka.strimzi.io/v1beta2
11 | kind: Kafka
12 | metadata:
13 |   name: my-cluster
14 | spec:
15 |   # ...
16 |   jmxTrans:
17 |     outputDefinitions:
18 |       - outputType: "com.googlecode.jmxtrans.model.output.StdOutWriter"
19 |         name: "standardOut"
20 |       - outputType: "com.googlecode.jmxtrans.model.output.GraphiteOutputWriter"
21 |         host: "mylogstash.com"
22 |         port: 31028
23 |         flushDelayInSeconds: 5
24 |         name: "logstash"
25 |     kafkaQueries:
26 |       - targetMBean: "kafka.server:type=BrokerTopicMetrics,name=*"
27 |         attributes: ["Count"]
28 |         outputs: ["standardOut"]
29 |   # ...
30 | ```
31 |
32 | ## Issues
33 |
34 | The JMX Trans tool seems to be stale.
35 | The last release is from March 31st 2021 - so more than a year and a half ago.
36 | Because the last release is so old, there are also many CVEs in the different dependencies it uses.
37 | In addition to that, it is falling behind in other aspects as well.
38 | For example, while we are moving all our container images to Java 17, JMX Trans does not run on Java 17, so it needs to stick with Java 11.
39 |
40 | ## Proposal
41 |
42 | Since the JMX Trans project does not seem to be developed anymore, we should first deprecate the JMX Trans support and, if nothing changes, remove it completely.
43 |
44 | In the first phase - as part of Strimzi 0.33 (currently expected in the second part of December or early January) - we will:
45 | * Deprecate the `.spec.jmxTrans` API in the Kafka custom resource.
46 | * Update the docs to indicate that JMX Trans is deprecated.
47 | * Announce the deprecation to the users as part of the Strimzi 0.33 communication (release notes, change log etc.).
48 |
49 | In the second phase - as part of Strimzi 0.35 (currently expected to be released in March or April 2023) - we will:
50 | * Retain the `.spec.jmxTrans` API in the Kafka custom resource.
51 | But it will stay deprecated and will be ignored by the operator (a warning will be issued if it is present in the custom resource).
52 | * Strimzi 0.35 and later will check for the existence of the JMX Trans resources and delete them if they exist.
53 | The rest of the operator code related to JMX Trans will be removed.
54 | * The container image for JMX Trans will be removed.
55 | * Remove JMX Trans from the docs.
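  |
  | For illustration only, such a warning could surface as a condition in the `Kafka` resource status similar to the following sketch (the `reason` and `message` values here are hypothetical, not a final design):
  |
  | ```yaml
  | status:
  |   conditions:
  |     - status: "True"
  |       type: Warning
  |       reason: JmxTransDeprecated      # hypothetical reason name
  |       message: JMX Trans is deprecated and the `.spec.jmxTrans` section is ignored by the operator
  | ```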
56 |
57 | In the final phase - as part of Strimzi 0.40 - we will:
58 | * Completely remove the operator functionality which checks for the JMX Trans resources and deletes them.
59 | Anyone who upgrades from Strimzi 0.34 or earlier to Strimzi 0.40 or later and had enabled JMX Trans will have to delete the resources manually.
60 |
61 | The `.spec.jmxTrans` API in the Kafka custom resource will be removed in the next version of the API (either `v1` or `v1beta3`) as it cannot be removed earlier for backwards compatibility reasons.
62 |
63 | If the JMX Trans project happens to be revived between the initial phase and the second phase - either as the original project or as a fork - we can un-deprecate the API and keep supporting it.
64 |
65 | ## Affected/not affected projects
66 |
67 | This proposal affects only the Strimzi Cluster Operator and Kafka clusters with JMX Trans enabled.
68 | Any other users or projects are not affected.
69 |
70 | ## Compatibility
71 |
72 | This proposal suggests deprecating and removing a currently supported feature.
73 | So all its users will be affected.
74 | Any other users - not using JMX Trans - will not be affected by this.
75 |
76 | To maintain backwards compatibility of the Custom Resource Definitions and the API they provide, the `.spec.jmxTrans` object will still be part of the API and will not be removed, only deprecated.
77 | But it will be ignored by the operator.
78 |
79 | ## Rejected alternatives
80 |
81 | ### Updating JMX Trans
82 |
83 | We could try to contribute to the JMX Trans project, or fork it and maintain it ourselves.
84 | However, this alternative was rejected because we do not have the resources to do this ourselves.
85 |
--------------------------------------------------------------------------------
/044-StrimziPodSets-graduation.md:
--------------------------------------------------------------------------------
1 | # StrimziPodSets graduation
2 |
3 | This proposal suggests an updated schedule for StrimziPodSet graduation.
4 |
5 | ## Current situation
6 |
7 | StrimziPodSets were proposed and approved as part of the [SP-031 StatefulSet Removal proposal](https://github.com/strimzi/proposals/blob/main/031-statefulset-removal.md) (visit the proposal for more details about the StrimziPodSets).
8 | StrimziPodSets are currently behind a feature gate named `UseStrimziPodSets` which is enabled by default, but can be optionally disabled.
9 | That means that the code paths using StatefulSets are still maintained and tested.
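  |
  | Feature gates are configured through the `STRIMZI_FEATURE_GATES` environment variable of the Cluster Operator. For example, disabling the feature gate looks like this (an excerpt of the `env` section of the Cluster Operator `Deployment`):
  |
  | ```yaml
  | env:
  |   - name: STRIMZI_FEATURE_GATES
  |     value: "-UseStrimziPodSets"
  | ```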
10 |
11 | The original schedule proposed that the feature gate move to beta (enabled by default) in Strimzi 0.29 and to GA (enabled by default without the possibility to disable it) in Strimzi 0.31.
12 | In reality, it moved to beta stage in Strimzi 0.30 and as of today (Strimzi 0.32) it is still in beta.
13 |
14 | The StrimziPodSets have now been enabled by default for 3 releases (0.30-0.32).
15 | There are no known issues, bugs or missing features on the Strimzi side.
16 |
17 | ## Proposal
18 |
19 | This proposal suggests an updated graduation schedule:
20 | * Strimzi 0.33 and 0.34 will be released with the `UseStrimziPodSets` feature gate still in beta
21 | * Unless some major issues are found during Strimzi 0.33 life-cycle (i.e. between the release of Strimzi 0.33 and 0.34), the feature gate will move to GA right after the 0.34.0 release and the code paths related to StatefulSets will be removed.
22 | The only functionality related to StatefulSet that will remain will be for upgrading from StatefulSets to StrimziPodSets.
23 | This will be the deletion of the old resources (StatefulSets, shared ConfigMaps etc).
24 | * Strimzi 0.35 will be released with the StrimziPodSets feature gate in GA, permanently enabled without the possibility to disable it.
25 |
26 | Thanks to this timeline:
27 | * Users will have additional time to test the StrimziPodSets with the 0.33 release
28 | * Removing the StatefulSet support right after the 0.34 release will give us additional time to ensure that the removal was done correctly (compared to removing it just before the Strimzi 0.34 release)
29 |
30 | Assuming this proposal is approved, the 0.33 release can be used to announce this and encourage users to test the StrimziPodSets.
31 | If any major bugs are found before the 0.34 release and before the StatefulSets code is removed, the timeline can be reconsidered.
32 |
33 | Moving forward with StrimziPodSets will make it easier to continue the development of the additional features built on top of StrimziPodSets.
34 | This includes KRaft support, node pools, or, in the long term, stretch clusters.
35 | It will also simplify testing.
36 |
37 | ## Rejected alternatives
38 |
39 | ### Removing StatefulSets right before the 0.34 release
40 |
41 | One considered alternative was to move the feature gate to GA already as part of 0.34 release.
42 | However, this would mean either removing the code _last minute_ before the release, or removing it right after the 0.33 release.
43 | The latter would mean that if any major issue is found later and we decide to change the schedule, it would be complicated to revert the changes.
--------------------------------------------------------------------------------
/049-prevent-broker-scale-down-if-it-contains-partition-replicas.md:
--------------------------------------------------------------------------------
1 |
2 | # Preventing scale-down of brokers containing partition replicas
3 | The purpose of this proposal is to introduce a mechanism that prevents brokers in a Kafka cluster from being scaled down when they are hosting partition replicas.
4 | The same mechanism can also be employed when using automatic rebalancing to scale a Kafka Cluster up or down.
5 |
6 | ## Current situation
7 |
8 | Currently, when removing brokers from the Kafka cluster, there is no check to see if these brokers contain any partition replicas.
9 | The [documentation](https://strimzi.io/docs/operators/latest/configuring.html#scaling-clusters-str) recommends that all topic-partition replicas are reassigned before doing this scaling action, which means that if someone tries to scale down without reassigning the replicas, it can cause availability issues or data loss.
10 |
11 | ## Motivation
12 |
13 | We should introduce logic that can detect whether a broker which is going to be removed still contains any partition replicas.
14 | If any partition replicas are still assigned to the broker, a warning should be added to the status of the Kafka resource, prompting users to do the reassignment, and the broker should be prevented from being removed until the partition replicas are reassigned.
15 |
16 | ## Proposal
17 |
18 | This proposal suggests how we can add a check to detect whether a broker still contains any partition replicas, and what to do if the broker scale-down is not possible.
19 |
20 | ## Implementation
21 |
22 | ### Process:
23 |
24 | - When the broker count is changed in the Kafka resource, the `reconcile` method of the `KafkaReconciler` will be triggered to reconcile the Kafka brokers.
25 | - The `canScaleDownBrokers()` utility method will be present at the top of the compose chain in the `reconcile()` method of the `KafkaReconciler` to make sure that every other method which requires the replica count uses the correct replica count based on the outcome of the check.
26 | - The `canScaleDownBrokers()` method will only run if the current Kafka replica count (the count before the Kafka custom resource was modified) is greater than the desired replica count present in the Kafka custom resource.
27 | We can get the desired Kafka replica count by using `kafka.getReplicas()` where `kafka` is an object of the `KafkaCluster` class.
28 | - This method will check if the broker contains any partition replicas or not and will continue the process based on the outcome.
29 | - To do so, the topic metadata will be queried to detect if the broker contains any partition replicas.
30 | - An Admin client instance will be used to connect to the cluster and retrieve the topic details (topic name and topic description).
31 | - We can then use this information to check if the broker contains any partition replicas or not.
32 | - The scale-down is only performed once we are sure that the brokers that are going to be removed don't contain any partition replicas.
33 | By doing this we avoid any partial scale-down.
34 |
35 | ### What to do if a broker contains partitions?
36 |
37 | #### Flow:
38 |
39 | - If partition replicas are found on the broker, we will revert the Kafka replica count to the previous value by setting the replicas directly in the `KafkaCluster` class using the `setReplicas()` method.
40 | Changing the Kafka replica count directly in the `KafkaCluster` class helps to ensure that we keep the same replica count everywhere, such as when generating certificates, services, ingresses, routes etc.
41 | - The broker certificates, services, ingresses, routes etc. will be treated with the original number of nodes and the rest of the reconciliation will be done normally.
42 | - We also generate a new condition which will be added to the Kafka resource status, indicating that the scale-down was not performed. It will also contain the currently used `spec.kafka.replicas` count in the condition message.
43 | ```yaml
44 | status:
45 |   clusterId: DoRj5f84Sruq_7TJ31y7Zw
46 |   conditions:
47 |     - lastTransitionTime: "2023-02-22T10:18:56.578009768Z"
48 |       message: 'Cannot scale down since the broker contains partition replicas. The `spec.kafka.replicas` should be reverted back to 4 directly in the Kafka resource.'
49 |       reason: ScaleDownException
50 |       status: "True"
51 |       type: Warning
52 |     - lastTransitionTime: "2023-02-22T10:18:57.664668863Z"
53 |       status: "True"
54 |       type: Ready
55 | ```
56 | Note: by the time the replicas are reverted, the storage validation will already be complete, based on the replica count present in the Kafka custom resource.
57 | This can cause some issues if someone tries to make some forbidden changes (changes that might not be supported) to the storage during this time frame.
58 | This is hard to prevent in the current code.
59 | But the likelihood of this happening at the same time should be relatively small.
60 | So this proposal suggests to ignore this risk.
61 |
62 | ### How to bypass the broker scale down mechanism
63 |
64 | - To bypass the broker scale down mechanism you can use the annotation `strimzi.io/bypass-broker-scaledown-check: "true"` on the Kafka custom resource:
65 | ```sh
66 | kubectl annotate Kafka my-cluster strimzi.io/bypass-broker-scaledown-check="true"
67 | ```
68 |
69 | ### Other Scenarios
70 |
71 | - During the check, if the admin client is not able to connect to the cluster (not able to get the topic details), we will update the status of the Kafka CR with the respective warning and revert the replica count in the `KafkaCluster` class.
72 | - If the Kafka cluster is just initialized and the pods are not ready, the `canScaleDownBrokers()` utility method will not run, because the current Kafka replica count (the count before the Kafka custom resource is modified) and the replica count present in the Kafka custom resource will both be 0, and the mechanism requires the current replica count to be greater than the desired one.
73 | - If the current number of Kafka replicas/pods is 0, the mechanism will not run, since with zero brokers there is nothing to consider a scale-down.
74 |
75 | ## Affected/not affected projects
76 |
77 | This change will affect the Strimzi Cluster Operator module and mostly the `KafkaReconciler` class.
78 |
79 | ## Rejected alternatives
80 |
81 | No rejected alternatives.
--------------------------------------------------------------------------------
/054-stopping-kafka-connect-connectors.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Support stopping connectors
4 |
5 | This proposal aims at adding support for stopping Kafka Connect connectors.
6 |
7 | ## Current situation
8 |
9 | At the moment Strimzi allows pausing connectors. This is done by a property, `pause`, in the `KafkaConnectorSpec` schema. Whenever it is set to `true`, the connector and its tasks are paused.
10 |
11 | For example:
12 | ```yaml
13 | spec:
14 |   class: org.apache.kafka.connect.file.FileStreamSourceConnector
15 |   config:
16 |     file: /opt/kafka/LICENSE
17 |     topic: my-topic
18 |   pause: true
19 |   tasksMax: 1
20 | ```
21 |
22 | ## Motivation
23 |
24 | Since Kafka 3.5, Kafka Connect connectors can also be stopped. This feature was added via [KIP-875](https://cwiki.apache.org/confluence/display/KAFKA/KIP-875%3A+First-class+offsets+support+in+Kafka+Connect). Stopping a connector is done by using a new REST endpoint `PUT /connectors/{connector}/stop`. Compared to the paused state, where the connector and its tasks stay instantiated, in the stopped state only the configuration of the connector is kept and nothing is actually running.
25 |
26 | A paused connector still uses some resources but is faster to resume. So pausing is well-suited if you want to halt the data flow for a short duration. For longer durations, it can be beneficial to free up memory and other resources by fully stopping the connector with the new API. The stopped state will also be required to use the offset delete/reset endpoints that should come with Kafka 3.6. For these reasons, Strimzi should support both pausing and stopping connectors.
27 |
28 | The `PUT /connectors/{connector}/resume` REST endpoint is used to restart both paused and stopped connectors.
29 |
30 | ## Proposal
31 |
32 | Since a connector can't be paused and stopped at the same time, the proposal is to replace the current `pause` property and add a new property, `state`, in the `AbstractConnectorSpec` schema. That new property will accept the `paused`, `stopped` or `running` values.
33 |
34 | The `pause` property will be marked as deprecated. In the next Strimzi API version, the `pause` field will be deleted.
35 |
36 | This feature requires at least Kafka 3.5 to work. In case it is merged while Strimzi still supports older releases, a note will be added to the documentation to point out this limitation. When trying to stop a connector with Kafka 3.4, the connector will instead be paused.
37 |
38 | Example YAML for stopping a connector:
39 | ```yaml
40 | spec:
41 |   class: org.apache.kafka.connect.file.FileStreamSourceConnector
42 |   config:
43 |     file: /opt/kafka/LICENSE
44 |     topic: my-topic
45 |   state: stopped
46 |   tasksMax: 1
47 | ```
48 |
49 | This proposal does not intend to address deleting/resetting offsets. As explained in the motivation section, stopping connectors has value on its own. Adding support for deleting/resetting connector offsets will be tackled in a separate proposal once Strimzi adopts a Kafka version that supports this feature (expected in Kafka 3.6).
50 |
51 | ## Affected/not affected projects
52 |
53 | This only affects `strimzi-kafka-operator`. This will be usable by regular connectors (via `KafkaConnectorSpec`) and by MirrorMaker connectors (via `KafkaMirrorMaker2ConnectorSpec`).
54 |
55 | ## Compatibility
56 |
57 | If `state` is not set, it will:
58 | - default to `running` if `pause` is `false` or if `pause` is not set
59 | - default to `paused` if `pause` is `true`
60 | But this will not be done as a change to the custom resource; it will just be handled that way internally.
61 |
62 | If `state` is set, it will take precedence over the old `pause` field. If both fields are set, a warning will be emitted to notify the user of a potential conflict and a message will be added to the `conditions` of the resource.
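  |
  | For illustration, in the following resource the deprecated `pause` field would be ignored, the connector would keep running, and a warning would be reported:
  |
  | ```yaml
  | spec:
  |   class: org.apache.kafka.connect.file.FileStreamSourceConnector
  |   config:
  |     file: /opt/kafka/LICENSE
  |     topic: my-topic
  |   pause: true      # deprecated, ignored because state is set
  |   state: running   # takes precedence over pause
  |   tasksMax: 1
  | ```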
63 |
64 | ## Rejected alternatives
65 |
66 | In the original issue I proposed adding a `stop` property alongside the existing `pause` property. As explained above, these states are mutually exclusive, so it does not make sense to have both.
67 |
--------------------------------------------------------------------------------
/057-run-zk-kraft-clusters-parallel.md:
--------------------------------------------------------------------------------
1 | # Allow running ZooKeeper and KRaft based clusters in parallel
2 |
3 | This proposal is about changing the way the current `UseKRaft` feature gate works.
4 | It changes the way the Strimzi Cluster Operator handles ZooKeeper and KRaft based clusters when it is enabled.
5 |
6 | ## Current situation
7 |
8 | Currently, when the `UseKRaft` feature gate is enabled (together with the required `KafkaNodePools` one), the operator expects the Apache Kafka clusters already running to be KRaft-based.
9 | It means that the `Kafka` custom resources are configured together with `KafkaNodePool`(s) with `broker` and `controller` roles.
10 | There is no way to differentiate between clusters running in ZooKeeper or KRaft mode.
11 | When a `Kafka` custom resource is configured to actually use ZooKeeper or it's just badly configured but supposed to be KRaft-based, the operator detects it as having a missing KRaft controllers configuration and logs the following warning:
12 |
13 | ```shell
14 | io.strimzi.operator.common.model.InvalidResourceException: The Kafka cluster my-cluster is invalid: [At least one KafkaNodePool with the controller role and at least one replica is required when KRaft mode is enabled]
15 | ```
16 |
17 | In this case, the reconciliation fails and the cluster is not operated at all.
18 | It means that the operator doesn't take any actions on changes applied by the user to the `spec.kafka` or `spec.zookeeper` in the `Kafka` custom resource.
19 |
20 | Furthermore, the `UseKRaft` feature gate was created for development and testing.
21 | As we move forward, it will soon be available for regular clusters, and the current state is not going to be sufficient anymore.
22 |
23 | ## Motivation
24 |
25 | Until ZooKeeper support is completely removed, leaving a ZooKeeper-based cluster unmanaged when the `UseKRaft` feature gate is enabled is a bad experience for the user.
26 | Before that, the user should be able to have both ZooKeeper and KRaft based clusters running and operated when the `UseKRaft` feature gate is enabled.
27 | Also, taking into account the future ZooKeeper to KRaft migration process, as described in the proposal [PR#90](https://github.com/strimzi/proposals/pull/90), it would help the operator migration component detect which cluster is ZooKeeper-based, and therefore can be migrated, and which one is already KRaft-based.
28 |
29 | ## Proposal
30 |
31 | The proposal is about adding a new `strimzi.io/kraft` annotation to the `Kafka` custom resource, to be applied by the user, in order to identify a ZooKeeper or KRaft based cluster.
32 | The possible values would be:
33 |
34 | * `disabled`, missing or any invalid value: identifies a ZooKeeper-based cluster.
35 | * `enabled`: identifies a KRaft-based cluster.
36 |
37 | This way, during the reconciliation, the operator is able to "detect" a ZooKeeper-based cluster, avoiding the warning and allowing the user to operate it.
38 | On the KRaft side, the annotation is needed for the operator to reconcile the corresponding `Kafka` custom resource in KRaft mode.
39 | Without the annotation, even with the `UseKRaft` feature gate enabled, the operator will try to handle the cluster as a ZooKeeper-based one.
40 |
41 | | Operator Feature Gate | `strimzi.io/kraft` annotation | Operator behaviour |
42 | |-----------------------|-------------------------------|--------------------------|
43 | | `-UseKRaft` | `enabled` | Ignore annotation |
44 | | `-UseKRaft` | missing or anything else | ZooKeeper reconciliation |
45 | | `+UseKRaft` | `enabled` | KRaft reconciliation |
46 | | `+UseKRaft` | missing or anything else | ZooKeeper reconciliation |
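  |
  | For example, a KRaft-based cluster would be identified by annotating its `Kafka` custom resource as follows (spec details elided):
  |
  | ```yaml
  | apiVersion: kafka.strimzi.io/v1beta2
  | kind: Kafka
  | metadata:
  |   name: my-cluster
  |   annotations:
  |     strimzi.io/kraft: enabled
  | spec:
  |   # ...
  | ```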
47 |
48 | The `UseKRaft` feature gate does not currently support any upgrades of KRaft clusters.
49 | It is expected to be used only for short-lived clusters used for development and testing.
50 | So no long-lived KRaft clusters are expected to exist.
51 | Therefore adding the annotation now does not present any backwards compatibility issues.
52 | Only ZooKeeper-based clusters are expected to exist, alongside newly created KRaft-based clusters carrying the `strimzi.io/kraft: enabled` annotation.
53 | This approach is actually the same as for the node pools: enabling the `KafkaNodePools` feature gate on the operator is not enough and the user needs to apply the `strimzi.io/node-pools: enabled` annotation on the `Kafka` custom resource using node pools for brokers (and controllers, if KRaft is enabled as well).
54 |
55 | As a non-goal of this proposal, the same annotation could be even used for handling the ZooKeeper to KRaft migration steps as described in the proposal [PR#90](https://github.com/strimzi/proposals/pull/90).
56 |
57 | ## Affected/not affected projects
58 |
59 | The Strimzi Cluster Operator is the only project affected by this proposal.
60 | Its logic needs to be updated in order to be able to handle the new `strimzi.io/kraft` annotation (or its absence).
61 | No other projects in the Strimzi ecosystem are impacted by this proposal.
62 |
63 | ## Compatibility
64 |
65 | If the user is running the operator with the `UseKRaft` feature gate enabled, it will detect ZooKeeper-based clusters, allowing them to be operated again.
66 | If the user has a KRaft-based cluster already deployed, because of the missing annotation, the operator will try to handle it as if it was ZooKeeper-based, which will fail.
67 | In this case, if the user applies the `strimzi.io/kraft: enabled` annotation on it, there could be unexpected results in the reconciliation, so doing this is not supported.
68 | In general, when this proposal is in place, the expectation is that there won't be KRaft-based clusters already running but there could be only ZooKeeper-based clusters running.
69 | Breaking this backward compatibility is expected by taking into account that the KRaft support in Strimzi is still behind a feature gate and should be considered just for development purposes.
70 |
71 | ## Rejected alternatives
72 |
73 | As this is the first iteration of the proposal, there are no rejected alternatives.
74 |
--------------------------------------------------------------------------------
/058-deprecate-and-remove-envvar-config-provider.md:
--------------------------------------------------------------------------------
1 | # Deprecate and remove EnvVarConfigProvider
2 |
3 | This proposal suggests deprecating the [kafka-env-var-config-provider](https://github.com/strimzi/kafka-env-var-config-provider) project. It also proposes a timeline to archive the project and stop including its JAR in Strimzi releases.
4 |
5 | ## Current situation
6 |
7 | In [proposal 30](https://github.com/strimzi/proposals/blob/main/030-env-var-config-provider.md), we added an implementation of the [ConfigProvider](https://kafka.apache.org/35/javadoc/org/apache/kafka/common/config/provider/ConfigProvider.html) interface of Apache Kafka that allows retrieving configuration values at runtime from environment variables. This implementation lives in the `kafka-env-var-config-provider` project. This is especially useful to safely configure sensitive settings in virtualized/containerized environments like Kubernetes.
8 |
9 | For example, a Kafka client can use this provider with specified environment variables:
10 | ```properties
11 | config.providers=env
12 | config.providers.env.class=io.strimzi.kafka.EnvVarConfigProvider
13 | option1=${env:FIRST_ENV_VAR}
14 | ```
15 |
16 | ## Motivation
17 |
18 | Since Apache Kafka 3.5.0, there's a built-in `ConfigProvider` implementation that works with environment variables. It was added via [KIP-887](https://cwiki.apache.org/confluence/display/KAFKA/KIP-887%3A+Add+ConfigProvider+to+make+use+of+environment+variables). It provides the same functionality as Strimzi's implementation.
19 |
20 | Example usage:
21 | ```properties
22 | config.providers=env
23 | config.providers.env.class=org.apache.kafka.common.config.provider.EnvVarConfigProvider
24 | option1=${env:FIRST_ENV_VAR}
25 | ```
26 |
27 | ## Proposal
28 |
29 | Once Strimzi only supports Kafka versions >= 3.5.0, we should:
30 | - deprecate the [kafka-env-var-config-provider](https://github.com/strimzi/kafka-env-var-config-provider) project
31 | - replace all references to `io.strimzi.kafka.EnvVarConfigProvider` with `org.apache.kafka.common.config.provider.EnvVarConfigProvider`. This includes code, examples, documentation, etc
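  |
  | For example, a `KafkaConnect` resource currently referencing the Strimzi provider would be updated to use the Kafka class (a sketch, spec details elided):
  |
  | ```yaml
  | spec:
  |   # ...
  |   config:
  |     config.providers: env
  |     config.providers.env.class: org.apache.kafka.common.config.provider.EnvVarConfigProvider
  | ```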
32 |
33 | After 4 Strimzi releases have happened or Strimzi reaches 1.0.0 (whichever comes first), we will archive the `kafka-env-var-config-provider` project and stop including the JAR in Strimzi releases.
34 |
35 | ## Affected/not affected projects
36 |
37 | This impacts:
38 | - [kafka-env-var-config-provider](https://github.com/strimzi/kafka-env-var-config-provider): To be deprecated and archived
39 | - [strimzi-kafka-operator](https://github.com/strimzi/strimzi-kafka-operator): Update all references to `io.strimzi.kafka.EnvVarConfigProvider` to use Kafka's implementation
40 |
41 | ## Compatibility
42 |
43 | Users relying on Strimzi's `EnvVarConfigProvider` will have to migrate to Kafka's implementation.
44 |
45 | ## Rejected alternatives
46 |
47 | Do nothing and keep maintaining our own environment variable ConfigProvider implementation.
48 |
--------------------------------------------------------------------------------
/062-UseKRaft-feature-gate-promotion.md:
--------------------------------------------------------------------------------
1 | # Promotion of the `UseKRaft` feature gate
2 |
3 | The `UseKRaft` feature gate allows users and developers of the Strimzi operator to deploy and manage a KRaft-based Kafka cluster.
4 | It was originally introduced in May 2022 in the [Strimzi Proposal #36 - KRaft support: ZooKeeper-less Kafka](https://github.com/strimzi/proposals/blob/main/036-kraft-mode.md).
5 | When introduced, KRaft mode in both Apache Kafka and Strimzi had significant limitations.
6 | Consequently, no specific timeline was established for the graduation of the feature gate.
7 | Currently, the feature gate remains in the _alpha_ stage and is disabled by default.
8 | While there are still some remaining limitations in Apache Kafka and Strimzi, they are less significant than they were when the original proposal was written.
9 | This proposal aims to create a plan for the graduation of the `UseKRaft` feature gate and the changes related to it.
10 |
11 | ## Current limitations
12 |
13 | The current support for KRaft in Strimzi (and Apache Kafka) has the following major limitations:
14 | * Support for migration of ZooKeeper-based clusters to KRaft
15 |   * Tracked in [strimzi/strimzi-kafka-operator#9433](https://github.com/strimzi/strimzi-kafka-operator/issues/9433), [strimzi/strimzi-kafka-operator#9447](https://github.com/strimzi/strimzi-kafka-operator/issues/9447), and [strimzi/strimzi-kafka-operator#9448](https://github.com/strimzi/strimzi-kafka-operator/issues/9448)
16 |   * This is currently work in progress and waiting for the Apache Kafka 3.7.0 release with several bug-fixes and the implementation of [KIP-919](https://cwiki.apache.org/confluence/display/KAFKA/KIP-919%3A+Allow+AdminClient+to+Talk+Directly+with+the+KRaft+Controller+Quorum+and+add+Controller+Registration)
17 |   * Support for the migration is critical for existing users and has to be done before support for ZooKeeper-based clusters is dropped.
18 |     But missing support for migration does not affect any users who would want to deploy and run new clusters in KRaft mode.
19 |   * This is currently expected to be implemented in Strimzi 0.40 or 0.41.
20 | * Scaling of KRaft controller nodes
21 |   * Tracked in [strimzi/strimzi-kafka-operator#9429](https://github.com/strimzi/strimzi-kafka-operator/issues/9429)
22 |   * Currently not supported are:
23 |     * Scale-down of dedicated or mixed controller nodes that breaches the controller quorum
24 |     * Scale-up of dedicated controller nodes (the scale-up eventually succeeds, but the broker availability is not maintained)
25 |   * This is currently blocked by the support for [KIP-853](https://cwiki.apache.org/confluence/display/KAFKA/KIP-853%3A+KRaft+Controller+Membership+Changes).
26 |     This KIP might not be implemented earlier than in Apache Kafka 4.0.
27 |     Support for scaling controllers is currently not seen as a blocker for moving the `UseKRaft` feature gate to _GA_.
28 | * JBOD support
29 |   * Tracked in [strimzi/strimzi-kafka-operator#9437](https://github.com/strimzi/strimzi-kafka-operator/issues/9437)
30 |   * Expected to be implemented in Apache Kafka 3.7.0.
31 |     The Strimzi implementation will follow the release of Kafka 3.7.0 in Strimzi 0.40 or 0.41.
32 |
33 | ## Proposed timeline
34 |
35 | This proposal outlines the following timeline for the graduation of the `UseKRaft` feature gate:
36 | * Move to _beta_ phase and be enabled by default in Strimzi 0.40.0
37 | * Move to _GA_ phase and be permanently enabled in Strimzi 0.42.0
38 |
39 | It is worth noting that:
40 | * In addition to the `UseKRaft` feature gate, the KRaft clusters are enabled / disabled using the `strimzi.io/kraft` annotation.
41 | As a result, moving the `UseKRaft` feature gate to _beta_ or _GA_ does not mean that all new Kafka clusters have to use KRaft or that this has any impact on existing ZooKeeper-based clusters.
42 | KRaft will be applied only to the Kafka clusters with the right annotation.
43 | * Moving the `UseKRaft` feature to beta or GA does not mean we will drop support for ZooKeeper-based clusters.
44 | While moving the `UseKRaft` feature gate to _GA_ defines the earliest moment when support for ZooKeeper based clusters can be dropped, it is currently not expected to happen right after Strimzi 0.42 and this proposal does not establish any such plan.
45 |
46 | Moving the `UseKRaft` feature gate to _beta_ or _GA_ serves mainly the following objectives:
47 | * Signify the progress of the KRaft implementation and improvements in production-readiness.
48 | * Simplify running KRaft clusters by eliminating the need to enable any feature gate.
49 |
50 | ## Proposed changes
51 |
52 | Apart from the change to the feature gate status itself, this proposal also includes several other changes.
53 |
54 | ### CRD changes
55 |
56 | KRaft is usable only with `KafkaNodePool` custom resources that configure the number of replicas and the storage configuration.
57 | As part of promoting the `UseKRaft` feature gate to _beta_ stage, we will make the following fields in the `Kafka` custom resource optional instead of required:
58 | * `.spec.kafka.replicas`
59 | * `.spec.kafka.storage`
60 |
61 | In addition, the KRaft clusters do not need the ZooKeeper configuration.
62 | So the `.spec.zookeeper` section will be made optional as well.
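  |
  | As a sketch (not a final design), a node-pool-based KRaft cluster could then be described without these sections (listener and node pool details elided):
  |
  | ```yaml
  | apiVersion: kafka.strimzi.io/v1beta2
  | kind: Kafka
  | metadata:
  |   name: my-cluster
  |   annotations:
  |     strimzi.io/kraft: enabled
  |     strimzi.io/node-pools: enabled
  | spec:
  |   kafka:
  |     listeners:
  |       # ...
  |     # no replicas or storage: provided by the KafkaNodePool resources
  |   # no zookeeper section needed
  | ```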
63 |
64 | For ZooKeeper-based Kafka clusters, the validation of these fields will be done inside the Strimzi operator.
65 | Additionally, [CEL validation rules](https://kubernetes.io/docs/reference/using-api/cel/) will be considered as they might allow us to do additional validation on the Kubernetes level.
66 | The validation will check if the cluster is ZooKeeper-based and in that case require the fields mentioned above.
67 |
68 | For Kafka clusters using the node pools, a warning will be raised by the operator when the ignored `.spec.kafka.replicas` and `.spec.kafka.storage` fields are used.
69 | For KRaft-based clusters, a warning will be raised when `.spec.zookeeper` section is used.
70 |
71 | ### Safety check for existing clusters
72 |
73 | For existing clusters, a safety check will be implemented to prevent users from switching existing ZooKeeper-based clusters to KRaft-based clusters (or vice versa) by mistake.
74 | This check will use the `.status.kafkaMetadataState` field in the `Kafka` custom resource to prevent any unintentional switching between ZooKeeper-based and KRaft-based clusters.
75 | Switching cluster management modes must be performed through a migration process.
76 | This field is already defined in [Strimzi Proposal #59 - ZooKeeper to KRaft migration](https://github.com/strimzi/proposals/blob/main/059-zk-kraft-migration.md).
77 | The check will compare the desired cluster type with the existing type, and if they do not match, it will throw an exception and end the reconciliation.
78 | It will be implemented as part of the migration work if that is shipped in Strimzi 0.40.0, or separately if the migration is postponed to Strimzi 0.41.0.
80 |
81 | ### Examples
82 |
83 | The existing examples will be updated.
84 | We will also change the structure of the examples in the following way:
85 | * The file `examples/kafka/nodepools/kafka.yaml` will be moved to `examples/kafka/kafka-with-nodepools.yaml`
86 | * The `examples/kafka/nodepools/` directory with the remaining files will be renamed to `examples/kafka/kraft/`
87 |
88 | ### Other changes
89 |
90 | The unit, integration, and system tests as well as the documentation will be updated to be in sync with this proposal as well.
91 |
92 | ## Affected projects
93 |
94 | This proposal affects only the Strimzi Cluster Operator.
95 |
96 | ## Backwards compatibility
97 |
98 | This proposal has no impact on backwards compatibility.
99 |
--------------------------------------------------------------------------------
/063-pdb-generation-environment-variable.md:
--------------------------------------------------------------------------------
1 | # Pod Disruption Budget Generation Environment Variable Proposal
2 |
3 | ## Background and Problem Statement
4 |
5 | The Strimzi Kafka operator currently lacks the option to disable the creation of Pod Disruption Budgets (PDBs). In certain infrastructures, PDB creations for users are denied, resulting in operational challenges. Providing an option to disable PDB creation in Strimzi would address these constraints.
6 |
7 | ## Proposed Solution
8 |
9 | Introduce a new environment variable in Strimzi, [similar to the one for Network Policies](https://github.com/strimzi/proposals/blob/main/028-network-policy-generation-environment-variable.md), that allows global disabling of PDB creation. This proposal suggests the environment variable `STRIMZI_POD_DISRUPTION_BUDGET_GENERATION`, which by default is `true`. When set to `false`, the Strimzi operator will not generate Pod Disruption Budgets.
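  |
  | For example, disabling PDB generation would look like this in the Cluster Operator `Deployment` (an excerpt of its `env` section):
  |
  | ```yaml
  | env:
  |   - name: STRIMZI_POD_DISRUPTION_BUDGET_GENERATION
  |     value: "false"
  | ```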
10 |
11 | ## Rationale
12 |
13 | - **Operational Constraints:** In environments where the creation of Pod Disruption Budgets (PDBs) is not permitted, users face significant challenges in deploying Strimzi effectively. This limitation can hinder the adoption and utility of Strimzi in such environments.
14 |
15 | - **Configuration Flexibility:** Users require the flexibility to configure their Strimzi deployments in accordance with their specific operational policies and constraints. A rigid approach that mandates the creation of PDBs may not be compatible with all operational environments.
16 |
17 | - **Consistency with Existing Features:** Implementing this feature as an environment variable aligns with the existing approach used for Network Policy Generation in Strimzi. This consistency simplifies the understanding and adoption of the feature by existing users.
18 |
19 | - **Simplicity and Ease of Use:** Offering a global setting to disable PDB generation avoids the complexity and repetitive configuration that would be required if this setting was to be managed at the individual custom resource level.
20 |
21 | ## Current Implementation
22 |
23 | The Strimzi operator automatically creates PDBs to ensure high availability and minimize disruptions. However, this feature does not accommodate environments where PDB creation is restricted.
24 |
25 | ## Proposal Details
26 |
27 | 1. **Environment Variable Introduction:** Implement `STRIMZI_POD_DISRUPTION_BUDGET_GENERATION` with a default value of `true`.
28 | 2. **Disabling PDB Generation:** When set to `false`, the operator will skip PDB operations for all Strimzi components. It will not create, modify or delete any PDB.
29 | 3. **Documentation and Guidance:** Update Strimzi documentation to include instructions and implications of disabling PDB generation.
30 | 4. **Scope:** This change affects only the Pod Disruption Budget aspect and does not alter any other functionalities of the Strimzi operator.
31 |
32 | ## Affected Projects
33 |
34 | This proposal pertains solely to the [Strimzi Kafka Operator](https://github.com/strimzi/strimzi-kafka-operator).
35 |
36 | ## Compatibility
37 |
38 | - **Default Behavior:** By retaining `true` as the default value, existing deployments remain unaffected.
39 | - **Backward Compatibility:** This feature is an addition and does not alter existing functionalities.
40 |
41 | ## Rejected Alternatives
42 |
43 | 1. **CRD-based Option for PDB Disabling:** Initially considered, this was rejected for increasing complexity and requiring repetitive configuration.
44 | 2. **Global Disabling via Command-Line Flag in Strimzi Image Entrypoint:** Initially considered as an alternative, using a flag in the entrypoint command of the Strimzi image was ultimately deemed less flexible and consistent compared to an environment variable. This approach, while feasible, does not align as seamlessly with the configuration practices commonly used in Strimzi deployments.
45 |
46 | ## Conclusion
47 |
48 | Introducing an environment variable to globally control PDB generation in Strimzi provides the necessary flexibility for users operating in environments with strict PDB creation policies, while maintaining the ease of use and consistency with existing Strimzi features.
49 |
--------------------------------------------------------------------------------
/070-dont-fail-reconciliation-in-manual-rolling-update.md:
--------------------------------------------------------------------------------
1 | # Don't fail reconciliation when Manual Rolling Update fails
2 |
3 | This proposal addresses the Strimzi issue [strimzi/strimzi-kafka-operator#9654](https://github.com/strimzi/strimzi-kafka-operator/issues/9654).
4 |
5 | ## Current situation
6 |
7 | Today, we support manual rolling updates for the following operands:
8 | * ZooKeeper
9 | * Kafka brokers
10 | * Kafka Connect
11 | * Kafka MirrorMaker 2
12 |
13 | The manual rolling update is triggered by adding the `strimzi.io/manual-rolling-update="true"` annotation to a `Pod` or `StrimziPodSet` resource.
14 | And it is executed as part of the next reconciliation for a given operand.
15 | When the manual rolling update fails, then it fails the whole reconciliation.
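  |
  | For example, a rolling update of all pods of a `StrimziPodSet` can be requested as follows (a metadata excerpt; the annotation is typically applied with `kubectl annotate`):
  |
  | ```yaml
  | apiVersion: core.strimzi.io/v1beta2
  | kind: StrimziPodSet
  | metadata:
  |   name: my-cluster-kafka
  |   annotations:
  |     strimzi.io/manual-rolling-update: "true"
  | ```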
16 |
17 | The manual rolling update is done as one of the first steps in the reconciliation process.
18 | The reason for doing it as early as possible is to do the manual rolling update with the original definition of the related Kubernetes resources (e.g. StrimziPodSets, PVCs, Services, ConfigMaps etc.).
19 | If rolling the pod later in the reconciliation, the related Kubernetes resources would have already changed due to unrelated updates to the Strimzi custom resources, which can lead to issues.
20 |
21 | ## Motivation
22 |
23 | There can be many different reasons why the manual rolling update might fail.
24 | For example:
25 | * Due to partition-replicas not being in-sync
26 | * Due to the pod not reaching a Ready state
27 |
28 | In some situations, the reason for the manual rolling update failing is a problem that would be fixed later in the reconciliation.
29 | But the operator never gets to it, because the failed manual rolling update fails the whole reconciliation, which therefore doesn't continue.
30 |
31 | One such example that we saw multiple times is described in the [strimzi/strimzi-kafka-operator#9654](https://github.com/strimzi/strimzi-kafka-operator/issues/9654) issue:
32 |
33 | 1. Due to a storage issue, one of the Kafka nodes (node X) is deleted including its PVC and PV
34 | 2. At the same time, another Kafka node (node Y) is annotated for manual rolling update (either by the user but possibly also by Drain Cleaner)
35 | 3. The StrimziPodSet controller will restart the failed pod X, but without the PVC/PV it will be in a Pending state
36 | 4. Next periodical reconciliation starts and tries to roll the annotated pod Y.
37 | But the rolling update fails because of the pod X being in Pending state and its partition replicas not being in-sync.
38 | Rolling the pod Y as requested by the annotation would break availability.
39 | As a result the manual rolling update fails and the whole reconciliation fails as well.
40 | 5. However, the PVC creation step is only after the manual rolling update in the reconciliation process.
41 | So the PVC is never recreated and the Pending pod X remains stuck.
42 | 6. When the next reconciliation happens, the same problem repeats again because these two events block each other.
43 |
44 | ## Proposal
45 |
46 | This proposal suggests to change the way the errors of manual rolling update are handled.
47 | Instead of failing the reconciliation when the manual rolling update fails, we should continue with the reconciliation.
48 | That would allow the operator to continue and possibly fix some of the issues.
49 | For example - in case of an issue such as the one described above - the reconciliation would recreate the missing PVC, which would allow the Pending Pod to start and sync-up the data and later allow the manual rolling update of the annotated Pod.
50 |
51 | Proceeding with the reconciliation might cause the related Kubernetes resources to change.
52 | So, in a way, it goes against the reason why the manual rolling update is done early in the reconciliation process:
53 |
54 | > The reason for doing it as early as possible is to do the manual rolling update with the original definition of the related Kubernetes resources (e.g. StrimziPodSets, PVCs, Services, ConfigMaps etc.).
55 | > If rolling the pod later in the reconciliation, the related Kubernetes resources would have already changed due to unrelated updates to the Strimzi custom resources, which can lead to issues.
56 |
57 | However, it provides a reasonable compromise in doing the manual rolling update as early as possible as a _best effort_, but not getting stuck with it forever.
58 |
59 | This change will be applied to all of the operands supporting manual rolling updates.
60 |
61 | ### `ContinueReconciliationOnManualRollingUpdateFailure` feature gate
62 |
63 | As this proposal changes the behavior of the Strimzi Cluster Operator, the change will be introduced through a feature gate.
64 | The feature gate will be named `ContinueReconciliationOnManualRollingUpdateFailure`.
65 | When this feature gate is enabled, a failure of the manual rolling update will not cause a failure of the whole reconciliation but the reconciliation will continue with a warning instead.
66 | When the feature gate is disabled, the reconciliation will fail with an error as it does today.
67 |
68 | The expected roadmap for the feature gate is as follows:
69 | * Introduced in Strimzi 0.41.0 as alpha level feature gate
70 | * Move to _beta_ phase and be enabled by default in Strimzi 0.43.0
71 | * Move to _GA_ phase and be permanently enabled in Strimzi 0.45.0
72 |
73 | ## Affected projects
74 |
75 | This proposal affects only the Strimzi Cluster Operator and the ZooKeeper, Kafka brokers, Kafka Connect and Kafka Mirror Maker 2 operands.
76 |
77 | ## Backwards compatibility
78 |
79 | This proposal does not change any Strimzi APIs but changes the behavior of the Strimzi Cluster Operator.
80 | A feature gate is used to introduce this change to minimize the impact on existing users.
81 |
82 | ## Rejected alternatives
83 |
84 | ### Doing the manual rolling update as part of the regular rolling update
85 |
86 | One of the considered alternatives was to merge the manual rolling update into the regular rolling update.
87 | That would also address the issue described in the [strimzi/strimzi-kafka-operator#9654](https://github.com/strimzi/strimzi-kafka-operator/issues/9654) issue.
88 | However, it could also create new problems: an unrelated issue with a related Kubernetes resource (e.g. a new load balancer not being provisioned) could then block the manual rolling updates needed due to infrastructure disruptions such as node draining.
89 | Therefore this alternative was rejected.
90 |
--------------------------------------------------------------------------------
/071-deprecate-bridge-openapi-2.md:
--------------------------------------------------------------------------------
1 | # Deprecate and remove OpenAPI v2 (Swagger) support on the Strimzi HTTP bridge
2 |
3 | This proposal is about deprecating the [OpenAPI v2](https://swagger.io/specification/v2/) (Swagger) specification support on the Strimzi HTTP bridge.
4 | It also proposes a plan to remove such support after its deprecation across different bridge releases.
5 | The deprecation and removal will leave the bridge supporting only the [OpenAPI v3](https://spec.openapis.org/oas/latest.html) specification.
6 |
7 | ## Current situation
8 |
9 | Currently, the codebase provides two JSON files describing the HTTP endpoints exposed by the bridge.
10 | The [`openapiv2.json`](https://github.com/strimzi/strimzi-kafka-bridge/blob/main/src/main/resources/openapiv2.json) uses OpenAPI v2.
11 | The [`openapi.json`](https://github.com/strimzi/strimzi-kafka-bridge/blob/main/src/main/resources/openapi.json) uses OpenAPI v3.
12 | In reality, the `openapi.json` file is used internally to "load" the HTTP endpoints definition via the Vert.x Web OpenAPI component in order to build the web routes.
13 | The HTTP endpoints specification via OpenAPI is also used by the Vert.x Web OpenAPI component to validate parameters and body on the incoming HTTP requests.
14 | The `openapiv2.json` file is only returned as a resource when an HTTP client issues a request to the `/openapi` HTTP endpoint.
15 | Exposing the OpenAPI specification via a dedicated HTTP endpoint is useful to external systems like API gateways or tools for API testing and for client code auto-generation.
16 |
17 | ## Motivation
18 |
19 | The bridge has been exposing the HTTP endpoints definition via the OpenAPI v2 specification for supporting external systems and tools still using Swagger.
20 | Internally, it has always been using the OpenAPI v3 to "load" the HTTP endpoints definition, build the corresponding web routes and validate the parameters and body on the incoming HTTP requests.
21 | The OpenAPI v2 specification can be considered obsolete as the latest [release](https://swagger.io/specification/v2/) happened 10 years ago.
22 | Most of the API gateways, clients and tools are now supporting the OpenAPI v3 specification.
23 | Furthermore, every time there are changes in the HTTP endpoints definition, we need to keep the two JSON files in sync, because both are used, for different purposes, as explained before.
24 |
25 | ## Proposal
26 |
27 | The proposal is about deprecating the OpenAPI v2 specification support in the next Strimzi HTTP bridge 0.29.0 release and removing it in the first major or minor release of 2025.
28 | To make the transition smooth, the idea is to have two new HTTP endpoints:
29 |
30 | * `/openapi/v2`: still exposing the bridge HTTP endpoints definition with the OpenAPI v2 specification.
31 | * `/openapi/v3`: exposing the bridge HTTP endpoints definition with the OpenAPI v3 specification.
32 |
33 | During the deprecation period, starting with the 0.29.0 release, the HTTP endpoints definition will be available with both OpenAPI v2 and v3 specification on the two different endpoints.
34 | Any HTTP request issued to the current `/openapi` endpoint will be forwarded to the `/openapi/v2` endpoint, still with the OpenAPI v2 specification.
35 |
36 | At the end of the deprecation period, with the first major or minor release of 2025, the `/openapi/v2` endpoint will return the `410 Gone` HTTP status code instead.
37 | Any HTTP request issued to the `/openapi` endpoint will be forwarded to the `/openapi/v3`, with the OpenAPI v3 specification.
38 |
39 | ## Affected/not affected projects
40 |
41 | The Strimzi HTTP bridge is the only project to be affected by this proposal.
42 |
43 | ## Compatibility
44 |
45 | During the deprecation period, the compatibility with external systems and tools using the OpenAPI v2 specification is guaranteed with the newly added `/openapi/v2` endpoint and the current `/openapi` forwarding to it as described before.
46 | Of course, after the removal, the bridge won't be compatible with OpenAPI v2 specification anymore.
47 |
48 | ## Rejected alternatives
49 |
50 | One alternative was to deprecate with the 0.29.0 release and remove with the future 0.30.0 release, but it was rejected as it would not give users enough time to adapt, also taking into account the effort needed for the implementation.
--------------------------------------------------------------------------------
/072-kafkabrige-consumer-producer.md:
--------------------------------------------------------------------------------
1 | # Enhance KafkaBridge resource with consumer inactivity timeout and HTTP consumer/producer parts enablement
2 |
3 | Providing support in the Strimzi Kubernetes Operator for the following properties supported by the Strimzi HTTP Kafka Bridge:
4 | * `http.consumer.timeoutSeconds` - For deleting inactive consumers after a timeout (disabled by default).
5 | * `http.consumer.enabled` - To enable/disable the HTTP consumer part (enabled by default).
6 | * `http.producer.enabled` - To enable/disable the HTTP producer part (enabled by default).
7 |
8 | ## Current situation
9 |
10 | These properties are not yet configurable through the Strimzi Kubernetes Operator.
11 |
12 | ## Motivation
13 |
14 | Raised in the discussion at https://github.com/strimzi/strimzi-kafka-operator/issues/8732 and triaged on 29.6.2023: we should enable the configuration of these fields, but how the API should look needed a proposal to clarify the changes.
15 |
16 | ## Proposal
17 |
18 | Based on the discussions in https://github.com/strimzi/strimzi-kafka-operator/pull/9820, this proposal suggests adding the HTTP enablement for the consumer and producer in their respective sections, i.e. `spec.consumer.enabled` and `spec.producer.enabled`, as well as the `spec.consumer.timeoutSeconds` property.
19 |
20 | Suggestion:
21 |
22 | ```yaml
23 | apiVersion: "kafka.strimzi.io/v1beta2"
24 | kind: "KafkaBridge"
25 | metadata:
26 |   name: "test-kafka-bridge"
27 | spec:
28 |   replicas: 1
29 |   image: "my-test-image"
30 |   bootstrapServers: "my-cluster-kafka:9092"
31 |   consumer:
32 |     enabled: true
33 |     timeoutSeconds: 60
34 |     config:
35 |       foo: "bar"
36 |   producer:
37 |     enabled: false
38 |     config:
39 |       foo: "buz"
40 |   enableMetrics: false
41 | ```
42 |
43 | ## Affected/not affected projects
44 |
45 | Strimzi Kafka Bridge - a PR has been raised for the HTTP bridge to use the new environment variables introduced by the operator: https://github.com/strimzi/strimzi-kafka-bridge/pull/882
46 |
47 | ## Compatibility
48 |
49 | Not specifying `spec.consumer.enabled` or `spec.producer.enabled` implies not configuring the corresponding bridge parameters `http.consumer.enabled` and `http.producer.enabled`, which default to `true`; thus there is no impact on backward compatibility.
50 |
51 | Not specifying `spec.consumer.timeoutSeconds` implies not configuring the corresponding bridge parameter `http.consumer.timeoutSeconds`, which defaults to -1 and has the same effect (no timeout); thus there is no impact on backward compatibility.
52 |
53 | ## Rejected alternatives
54 |
55 | ### 1.
56 | Initially, in the implementation suggestion https://github.com/strimzi/strimzi-kafka-operator/pull/9820, the enabled properties for producer and consumer were added as part of the `http` properties, i.e. `spec.http.consumer.enabled` and `spec.http.producer.enabled`.
57 |
58 | Alternative suggestion:
59 |
60 | ```yaml
61 | apiVersion: "kafka.strimzi.io/v1beta2"
62 | kind: "KafkaBridge"
63 | metadata:
64 |   name: "test-kafka-bridge"
65 | spec:
66 |   replicas: 1
67 |   image: "my-test-image"
68 |   bootstrapServers: "my-cluster-kafka:9092"
69 |   http:
70 |     timeoutSeconds: 60
71 |     producer:
72 |       enabled: false
73 |     consumer:
74 |       enabled: true
75 |   consumer:
76 |     config:
77 |       foo: "bar"
78 |   producer:
79 |     config:
80 |       foo: "buz"
81 |   enableMetrics: false
82 | ```
83 |
--------------------------------------------------------------------------------
/080-deprecation-and-removal-of-storage-overrides.md:
--------------------------------------------------------------------------------
1 | # Deprecation and removal of Storage overrides
2 |
3 | Currently, when configuring persistent-volume-claim storage, users can use per-broker overrides to override the storage class used by the individual brokers.
4 | This feature should not be needed anymore with Kafka Node Pools and should be deprecated and removed.
5 |
6 | ## Current situation
7 |
8 | Currently, users can override the storage class used by persistent-volume-claim storage on a per-broker basis.
9 | The following example:
10 |
11 | ```yaml
12 | storage:
13 |   type: jbod
14 |   volumes:
15 |     - id: 0
16 |       type: persistent-claim
17 |       size: 100Gi
18 |       deleteClaim: false
19 |       class: my-storage-class
20 |       overrides:
21 |         - broker: 0
22 |           class: my-storage-class-zone-1a
23 |         - broker: 1
24 |           class: my-storage-class-zone-1b
25 |         - broker: 2
26 |           class: my-storage-class-zone-1c
27 | ```
28 |
29 | Would create a cluster where:
30 | * Broker with node ID 0 uses the storage class `my-storage-class-zone-1a`
31 | * Broker with node ID 1 uses the storage class `my-storage-class-zone-1b`
32 | * Broker with node ID 2 uses the storage class `my-storage-class-zone-1c`
33 | * All other brokers use storage class `my-storage-class`
34 |
35 | Using a different storage class per broker can be useful in some special use-cases.
36 | For example, when you need to guide the scheduling of the different brokers to different zones or nodes.
37 | But this kind of configuration has some challenges as well.
38 | Unless the broker ID is listed in the overrides, it will use the default storage class.
39 | So when you need to scale up your Kafka cluster, you have to make sure the overrides include entries for the node IDs that will be added.
40 |
41 | ## Motivation
42 |
43 | The overrides were very useful while all nodes were configured in the `Kafka` custom resource.
44 | But the Kafka Node Pools feature was designed from the beginning with the idea of using different configurations for different sets of nodes.
45 | Users using node pools can have multiple different node pools - each representing a subset of Kafka nodes - with different configuration.
46 | This includes a different storage class.
47 | It also does not require the user to specify exact node IDs, as the storage configuration is used for all nodes belonging to a given node pool.
48 | So it simplifies the configuration.
49 |
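For illustration, a minimal sketch of per-node-pool storage classes replacing the overrides from the example above (the pool name and replica count are assumptions):

```yaml
apiVersion: kafka.strimzi.io/v1beta2
kind: KafkaNodePool
metadata:
  name: pool-zone-1a
  labels:
    strimzi.io/cluster: my-cluster
spec:
  replicas: 1
  roles:
    - broker
  storage:
    type: jbod
    volumes:
      - id: 0
        type: persistent-claim
        size: 100Gi
        deleteClaim: false
        # The whole pool uses the zone-specific class, without listing node IDs
        class: my-storage-class-zone-1a
```

Similar pools (for example `pool-zone-1b` and `pool-zone-1c`) would use the other zone-specific classes.
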
50 | In addition to simplifying the configuration for the users, it will also allow us to streamline the code and the API.
51 | In particular, the _storage diffing_ needed to avoid undesired changes will be simplified.
52 |
53 | ## Proposal
54 |
55 | The persistent-volume-claim storage overrides will be deprecated immediately in Strimzi 0.43.
56 | They will remain supported and used, but users will be encouraged to move away from them and use Kafka Node Pools instead.
57 | This also falls into the time period when we expect most users to migrate to node pools and to KRaft which provides a good opportunity to move to per-node-pool storage class instead of using the overrides.
58 |
59 | Later, in the Strimzi version where support for ZooKeeper-based Kafka clusters is removed, the support for the storage overrides will be dropped and the overrides will be ignored.
60 | The overrides will remain in the CRD API for backwards compatibility but will not be used by the operator code.
61 | Finally, when moving to the new `v1` CRD API in the future, the fields will be completely removed.
62 |
63 | Kubernetes does not allow changing the storage class of existing persistent volumes.
64 | If any user does not migrate from the overrides in time, the existing Kafka nodes and persistent volumes will not be affected.
65 | Only when a PVC/PV is deleted or when new nodes are added during a scale-up will the overrides be ignored and the default storage class used.
66 | So the impact on existing users should be minimal.
67 |
68 | ### Warnings
69 |
70 | With the deprecation of the storage overrides, the operator will be updated to issue warnings when the overrides are used.
71 | The warnings will be printed in regular logs from the Cluster Operator.
72 | And they will be also added to the conditions in the `Kafka` CR status.
73 |
74 | ## Affected projects
75 |
76 | This proposal affects the Strimzi Cluster Operator only.
77 |
78 | ## Backwards compatibility
79 |
80 | This proposal removes the support for storage overrides in Kafka clusters.
81 | It will impact all users using it and they will have to migrate to node pools with different storage classes.
82 | Kafka clusters not using the overrides and any other operands supported by Strimzi will not be impacted in any way.
83 |
84 | ## Rejected alternatives
85 |
86 | ### Different timeline
87 |
88 | The timeline between the deprecation of the overrides and the removal of their support is relatively short (likely 3-4 Strimzi versions).
89 | If we think this is a problem, we can choose a different timeline.
90 | For example, we can deprecate the overrides right now, but postpone when we drop the support for them:
91 | * Drop the support only at a later version (e.g. first Strimzi release after June 2025)
92 | * Drop the support only when we migrate to the `v1` CRD API
93 |
94 | This might give users more time to handle it.
95 | However, while the migration time window is relatively short, I believe it is sufficient for most existing users, as it includes the migration to KRaft and node pools, and provides a good opportunity to remove this legacy feature.
96 |
--------------------------------------------------------------------------------
/082-moving-data-between-two-jbod-disks-using-cruise-control.md:
--------------------------------------------------------------------------------
1 | # Moving data between two JBOD disks using Cruise Control
2 |
3 | This proposal is about integrating the [`remove_disks`](https://github.com/linkedin/cruise-control/blob/main/cruise-control/src/main/resources/yaml/endpoints/removeDisks.yaml) endpoint from Cruise Control into Strimzi cluster operator.
4 | This endpoint will allow us to move the data between two JBOD disks.
5 |
6 | ## Current situation
7 |
8 | Currently, we receive multiple requests from community users to add the ability to move all Kafka logs between two disks in a JBOD storage array. This feature can be useful in the following scenarios:
9 | - The current disk is too small and the user wants to use a bigger one, or vice versa.
10 | - When we want to use a different Storage Class with different parameters or different storage types.
11 | - In case of disk removal to reduce the total storage.
12 |
13 | For now, this can be done using the Kafka CLI `kafka-reassign-partitions.sh` tool, but it takes a lot of manual steps, which is time-consuming and not very user-friendly.
14 |
15 | ## Motivation
16 |
17 | We should introduce logic in Strimzi that leverages the Cruise Control integration and makes it possible to move data between two JBOD disks.
18 | This feature will also allow us to remove the disks without the loss of data.
19 |
20 | ## Proposal
21 |
22 | Cruise Control provides the [`remove_disks`](https://github.com/linkedin/cruise-control/blob/main/cruise-control/src/main/resources/yaml/endpoints/removeDisks.yaml) HTTP REST endpoint to move replicas from a specified disk to other disks of the same broker. The operation performs only intra-broker rebalancing; it does not move data between brokers.
23 | This endpoint triggers a rebalancing operation that moves replicas, starting with the largest and proceeding to the smallest, to the remaining disks while ensuring the following constraint is met:
24 | ```sh
25 | 1 - (remainingUsageAfterRemoval / remainingCapacity) > errorMargin
26 | ```
27 | where:
28 | ```sh
29 | remainingUsageAfterRemoval = current usage for remaining disks + additional usage from removed disks
30 | remainingCapacity = sum of capacities of the remaining disks
31 | errorMargin = configurable property (default 0.1); it ensures that a percentage of the remaining disk space stays free when moving replicas
32 | ```
33 |
34 | To use the `remove_disks` endpoint in the Strimzi cluster operator, it should be added to the [`CruiseControlApi`](https://github.com/strimzi/strimzi-kafka-operator/blob/main/cluster-operator/src/main/java/io/strimzi/operator/cluster/operator/resource/cruisecontrol/CruiseControlApi.java) interface, and the corresponding implementation developed.
35 |
36 | ### Implementation
37 |
38 | To implement this feature, we will be adding a new mode to the `KafkaRebalanceMode` class.
39 | * `remove-disks`: It moves replicas from a specified disk to other disks of the same broker. It always uses intra-broker rebalancing.
40 | You can use this mode by changing the `spec.mode` to `remove-disks` in the `KafkaRebalance` resource.
41 |
42 | A `KafkaRebalance` custom resource would look like this:
43 |
44 | ```yaml
45 | apiVersion: kafka.strimzi.io/v1beta2
46 | kind: KafkaRebalance
47 | metadata:
48 | name: my-rebalance
49 | labels:
50 | strimzi.io/cluster: my-cluster
51 | spec:
52 | # setting the mode as `remove-disks` to move data between the JBOD disks
53 | mode: remove-disks
54 | # providing the list of brokers, and the corresponding volumes from which you want to move the replicas
55 | moveReplicasOffVolumes:
56 | - brokerId: 0
57 | volumeIds: [1, 2]
58 | - brokerId: 2
59 | volumeIds: [1]
60 | # ...
61 | ```
62 |
63 | ### Flow
64 |
65 | - The user should be using the `Kafka` resource with JBOD storage, making sure that more than one disk is configured on the brokers (see the sketch after this list).
66 | - When the Kafka cluster is ready, the user creates a `KafkaRebalance` custom resource with the `spec.mode` field set to `remove-disks`, providing the list of brokers and the corresponding volumes from which they want to move the replicas in the `spec.moveReplicasOffVolumes` field. If the `spec.moveReplicasOffVolumes` field is not set, the `KafkaRebalance` resource moves to the `NotReady` state, reporting that the `spec.moveReplicasOffVolumes` field is missing.
67 | - The `KafkaRebalanceAssemblyOperator` interacts with Cruise Control via the `/remove_disks` endpoint to generate an optimization proposal (by using the dry-run feature).
68 | - The proposal can be auto-approved by adding the `strimzi.io/rebalance-auto-approval: true` annotation to the `KafkaRebalance` resource, or approved manually by applying the `strimzi.io/rebalance=approve` annotation.
69 | - The `KafkaRebalanceAssemblyOperator` interacts with Cruise Control via the `/remove_disks` endpoint to perform the actual rebalancing.
70 |
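For reference, a minimal sketch of the JBOD storage configuration expected in the `Kafka` or `KafkaNodePool` resource, with two disks configured on the brokers (sizes are illustrative):

```yaml
storage:
  type: jbod
  volumes:
    - id: 0
      type: persistent-claim
      size: 100Gi
      deleteClaim: false
    - id: 1
      type: persistent-claim
      size: 100Gi
      deleteClaim: false
```
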
71 | > **NOTE** The optimization proposal will not show the load before optimization, it will only show the load after optimization. This is because in upstream Cruise Control we don't have the verbose tag enabled with the `remove_disks` endpoint.
72 |
73 | ### Other Scenarios
74 |
75 | - In case the user is not using JBOD storage and tries to generate the optimization proposal, the `KafkaRebalance` resource will move to the `NotReady` state, reporting that invalid log dirs were provided for the broker.
76 | - If JBOD is used with only a single disk configured on the brokers, the `KafkaRebalance` resource will move to the `NotReady` state, reporting that there are not enough log dirs to move the replicas for that broker.
77 | - If the removal would exceed the remaining disk capacity of the broker, the `KafkaRebalance` resource will move to the `NotReady` state, reporting that there is not enough capacity remaining to move the replicas for that broker.
78 | - This feature works fine with `KafkaNodePool` resources.
79 | - This feature works with KRaft only if Kafka version is greater than 3.7.0, as that version supports multiple JBOD disks on brokers.
80 |
81 | Errors for these scenarios are reported by Cruise Control.
82 | Based on these errors, we transition the `KafkaRebalance` resource to the `NotReady` state and update its status with the corresponding error message.
83 |
84 | ## Affected/not affected projects
85 |
86 | This change impacts the Cruise Control API related classes and the `KafkaRebalanceAssemblyOperator` class.
87 |
88 | ## Rejected alternatives
89 |
90 | No rejected alternatives.
91 |
--------------------------------------------------------------------------------
/086-archive-canary.md:
--------------------------------------------------------------------------------
1 | # Archive Canary project
2 |
3 | This proposal is about archiving the [Canary project](https://github.com/strimzi/strimzi-canary).
4 |
5 | ## Current situation
6 |
7 | The Canary project, written in the Go programming language, is a component for monitoring a Kafka cluster.
8 | It provides functionality to periodically check the availability of the Kafka cluster during normal runs, upgrades, downgrades, and rolling updates.
9 | That is done through a connection check to the cluster, and also via a producer and consumer that periodically exchange messages between the Kafka cluster and the Canary.
10 | The Canary then provides Prometheus metrics and alerts that allow users or cluster admins to react to Kafka cluster issues.
11 |
12 | ## Motivation
13 |
14 | The Canary, as mentioned, is written in Go; however, the Strimzi organization (and the engineers working on the projects inside this organization) is focused on using the Java programming language.
15 | This means the team is missing the necessary expertise in Go to solve various issues in Canary.
16 | Based on the previous proposal from PR [#58](https://github.com/strimzi/proposals/pull/58), the Sarama Kafka client library lacks features that are available in the official
17 | Java Kafka client library - and unlike the Java client, the Sarama library is not an official Kafka library.
18 | Additionally, there are multiple [issues created in the Canary project](https://github.com/strimzi/strimzi-canary/issues) that have remained without a comment or unresolved for a significant amount of time.
19 | The project's dependencies haven't been updated for two years now, meaning that there can be a lot of CVEs, unresolved issues with newer Kafka versions, and so on.
20 | The inactivity of the project shows that we do not have the time to continue developing Canary, which is also the main reason for creating this proposal.
21 |
22 | Because of these issues, I'm proposing to archive the Canary project.
23 |
24 | ## Canary in Java
25 |
26 | After a few releases of Canary, we realized that there was not enough Go expertise and that the Sarama Kafka client does not provide the functionality we need,
27 | so the [PR proposing to move Canary to Java](https://github.com/strimzi/proposals/pull/58) was created.
28 | The proposal described all the issues with the current implementation together with the proposed implementation and changes for the Canary in Java.
29 | In parallel with the proposal, a POC was written in Java and is available in the [im-konge/canary-java](https://github.com/im-konge/canary-java) repository.
30 | However, during the implementation process, we found out that a few things are not possible using the Java Kafka clients (for example the connection check, which was one of the main features of Canary) and that
31 | metrics related to the Sarama client are not relevant anymore.
32 | Changes like this would break backwards compatibility, meaning that it would not be a 1:1 copy of the Canary in Go.
33 |
34 | Other than that, we thought about changing the metrics provided by Canary to be more insightful, but we were unable to reach a consensus on the specific metrics to include.
35 | We had discussions with community users about how the Canary can be more useful to them, but the users ended up writing their own Canary-like tool
36 | with functionality useful to them.
37 |
38 | Because we did not move forward with the proposal and did not reach agreement on the future of the Canary for more than a year after the proposal was created, we decided to close the
39 | proposal, and we agreed to archive the project at the [community meeting on May 30th 2024](https://youtu.be/UpStul__uCw?si=GTA5edXJEnGgxP1a).
40 | If the community finds Canary valuable, provides feedback on how to improve its metrics and functionality, and we have sufficient capacity and resources, we will consider proposing a new project to rewrite Canary in Java.
41 |
42 | ## Proposal
43 |
44 | After this proposal is approved, we should:
45 | - archive the Canary project
46 | - remove the Canary install files from the Strimzi operators repository
47 | - now from the `/packaging/install` folder
48 | - as part of the next Strimzi release, the installation files will be removed from the `/install` folder
49 | - delete the mentions of the Canary project from the documentation and automation files (Makefiles)
50 | - inform users about archiving the project
51 |
52 | If the community:
53 |
54 | - finds Canary useful
55 | - provides additional information about how to improve its metrics or functionality
56 | - and we have the capacity to maintain the project
57 |
58 | we can consider proposing a new project to rewrite Canary in Java.
59 |
60 | ## Affected/not affected projects
61 |
62 | The only affected project is the [Canary](https://github.com/strimzi/strimzi-canary) that should be archived.
63 | In the Strimzi operators repository, the only affected parts are the [installation files](https://github.com/strimzi/strimzi-kafka-operator/tree/main/install/canary) and
64 | [in development installation files](https://github.com/strimzi/strimzi-kafka-operator/tree/main/packaging/install/canary) that should be deleted by this proposal.
65 |
66 | ## Compatibility
67 |
68 | Backwards compatibility is not relevant in this case, as the project will be archived and there is currently no other solution that would replace it.
69 |
70 | ## Rejected alternatives
71 |
72 | ### Maintaining Canary in Go and providing additional functionality
73 |
74 | One of the rejected alternatives was to keep updating the Canary in Go and add more functionality to it.
75 | This was discussed and rejected because of:
76 | - lack of Go experts in the Strimzi organization
77 | - resources - it would take a lot of time to learn Go and to properly test every new change without knowing how it will work
78 | - missing functionality in the Sarama Kafka client
79 |
80 | ### Maintaining Canary for dependency updates
81 |
82 | Another alternative was to keep the Canary project and update its dependencies.
83 | Some dependency updates come without breaking changes, but from time to time there are changes in the dependencies that require additional
84 | changes to the code, which brings us to the same situation as the previous alternative - someone would need to make the changes to the code,
85 | test them properly, and then release them.
86 | Because of these issues, we decided to reject this alternative.
87 |
88 | ### Rewrite Canary in Java
89 |
90 | A final alternative was to rewrite Canary in Java, but as mentioned in the [Canary in Java](#canary-in-java) section, it would not be the same Canary as the one in Go,
91 | and we did not agree on how the new metrics of the Canary in Java (and the overall implementation) should look.
92 | Because of this, we decided to reject this alternative.
--------------------------------------------------------------------------------
/088-support-mounting-of-CSI-volumes.md:
--------------------------------------------------------------------------------
1 | # Support for mounting CSI volumes
2 |
3 | Proposal [75 - Support for additional volumes](https://github.com/strimzi/proposals/blob/main/075-additional-volumes-support.md) introduced the possibility to mount additional volumes into Strimzi operand Pods.
4 | It added support for the following volume types:
5 | * Secrets
6 | * Config Maps
7 | * EmptyDir volumes
8 | * Persistent Volume Claims
9 |
10 | This proposal follows up on it and proposes adding support for CSI volumes.
11 |
12 | ## Motivation
13 |
14 | Mounting Persistent Volume Claims is useful for providing additional data volumes, for example, to store logs or for tiered storage.
15 | EmptyDir volumes are useful as a temporary storage.
16 | Finally, Kubernetes Secrets or Config Maps are useful for providing additional configuration data or credentials.
17 |
18 | But in some cases, Kubernetes Secrets might not be the ideal method for storing credentials.
19 | Users might prefer to use other mechanisms for loading credentials, such as using specialized CSI drivers to mount them directly.
20 | For example:
21 | * [cert-manager CSI Driver](https://cert-manager.io/docs/usage/csi/) can be used to mount certificates or SPIFFE (Secure Production Identity Framework for Everyone) identities
22 | * [Secret Store CSI Driver](https://secrets-store-csi-driver.sigs.k8s.io/introduction) for mounting secrets from enterprise-grade secret stores such as Vault, AWS Secret Manager, etc.
23 |
24 | CSI volumes can be also used to directly mount data volumes without needing to use Persistent Volume Claims as the _intermediaries_.
25 | While this might be useful in some cases, the main focus of this proposal is on specialized types of CSI drivers, such as those mentioned above, rather than on data volumes.
26 | However, nothing will prevent users from using this feature to mount data volumes as well (and there is no reason to prevent such use).
27 |
28 | ## Proposal
29 |
30 | In order to support the CSI volumes, a new field named `csi` will be added to the [`AdditionalVolume` class](https://github.com/strimzi/strimzi-kafka-operator/blob/87935da1fae794bab473a0470cbea214369ac985/api/src/main/java/io/strimzi/api/kafka/model/common/template/AdditionalVolume.java#L41).
31 | This field will use the Fabric8 type `CSIVolumeSource` and map to the [Kubernetes `CSIVolumeSource` structure](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.25/#csivolumesource-v1-core).
32 | This would allow users to define the CSI volumes in the container template fields.
33 | For example, to mount a Cert Manager certificate in Kafka Connect, users can use the following YAML:
34 |
35 | ```yaml
36 | template:
37 | connectContainer:
38 | volumeMounts:
39 | - name: certificate
40 | mountPath: /mnt/certificate/
41 | pod:
42 | volumes:
43 | - name: certificate
44 | csi:
45 | driver: csi.cert-manager.io
46 | readOnly: true
47 | volumeAttributes:
48 | csi.cert-manager.io/issuer-name: my-ca
49 | csi.cert-manager.io/dns-names: ${POD_NAME}.${POD_NAMESPACE}.svc.cluster.local
50 | ```
51 |
52 | This configuration uses the cert-manager CSI driver to generate a new certificate with the `my-ca` Issuer, and mount it in the `/mnt/certificate/` path.
53 | The `dns-names` attribute specifies the DNS names the certificate will be requested for.
54 | The `readOnly: true` flag specifies that this volume is read-only.
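
As another illustration, a hedged sketch using the Secret Store CSI Driver (the `SecretProviderClass` named `my-provider` is an assumption and must be configured separately for the chosen secret store):

```yaml
template:
  pod:
    volumes:
      - name: vault-secrets
        csi:
          driver: secrets-store.csi.k8s.io
          readOnly: true
          volumeAttributes:
            # References a SecretProviderClass resource describing the external secret store
            secretProviderClass: my-provider
```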
55 |
56 | ## Affected projects
57 |
58 | This proposal affects the Strimzi Cluster Operator only.
59 |
60 | ## Backwards compatibility
61 |
62 | There is no impact on backwards compatibility.
63 |
64 | ## Rejected alternatives
65 |
66 | There are currently no rejected alternatives.
67 |
--------------------------------------------------------------------------------
/089-adopt-connect-health-endpoint.md:
--------------------------------------------------------------------------------
1 | # Adopt the Kafka Connect health check endpoint
2 |
3 | In a Kubernetes node, the kubelet component uses the configured liveness probe to know when to restart a container.
4 | With a suitably written application, this can be used to restart a container if the application's process enters an illegal state (e.g. deadlock).
5 | Additionally, it uses the configured readiness probe to know when a container is ready to start accepting traffic.
6 |
7 | A Kafka Connect health check HTTP endpoint is available since Kafka 3.9.0 release ([KIP-1017](https://cwiki.apache.org/confluence/display/KAFKA/KIP-1017%3A+Health+check+endpoint+for+Kafka+Connect)).
8 | This proposal describes a possible approach for adopting the health check endpoint for Kafka Connect and MirrorMaker 2 components.
9 |
10 | ## Current situation
11 |
12 | Kafka Connect provides a REST API for managing connectors.
13 | Strimzi users can deploy Kafka Connect in distributed mode by simply creating a `KafkaConnect` resource.
14 | A `KafkaMirrorMaker2` resource can be used to mirror data between Kafka clusters, and it is reconciled by reusing the Kafka Connect logic.
15 | Both of these components use the `/` HTTP (root) endpoint for liveness and readiness probes on port 8083 (rest-api).
16 |
17 | This is the default HTTP probe configuration shared by both components:
18 |
19 | | Property name | Default value | Description |
20 | |---------------------|---------------|----------------------------------------------------------------------------------------------|
21 | | initialDelaySeconds | 60            | The initial delay before the health is first checked.                                          |
22 | | timeoutSeconds | 5 | The timeout for each attempted health check. |
23 | | periodSeconds | 10 | How often to perform the probe. |
24 | | successThreshold | 1 | Minimum consecutive successes for the probe to be considered successful after having failed. |
25 | | failureThreshold | 3 | Minimum consecutive failures for the probe to be considered failed after having succeeded. |
26 |
27 | Strimzi does not support the HTTPS protocol for the Kafka Connect REST API ([KIP-208](https://cwiki.apache.org/confluence/display/KAFKA/KIP-208%3A+Add+SSL+support+to+Kafka+Connect+REST+interface)).
28 |
29 | Example output with `/` endpoint (probes only care about the HTTP response status code):
30 |
31 | ```sh
32 | $ kubectl exec my-cluster-kafka-0 -- curl -s http://my-connect-cluster-connect-api:8083 | jq
33 | {
34 | "version": "3.9.0",
35 | "commit": "a60e31147e6b01ee",
36 | "kafka_cluster_id": "_xIIeVIOQMKXimv_l96WtQ"
37 | }
38 | ```
39 |
40 | ## Motivation
41 |
42 | Using the `/` endpoint for Kafka Connect and MirrorMaker 2 health checks is a common approach, but it does not actually test for readiness, because these requests can complete before the worker has started.
43 | Instead, the `/health` endpoint waits for the worker startup phase to complete, which consists of creating the internal topics if they do not exist, reading the internal topics fully, and joining the cluster.
44 |
45 | If the worker has not yet completed the startup phase, or it is unable to respond in time, the response will have a 5xx status code.
46 | All Kafka Connect endpoints have a timeout of 90 seconds, with the exception of the `/health` endpoint, where it is hardcoded to 10 seconds for faster unhealthy worker detection.
47 | Unlike the `/` endpoint, the `/health` endpoint response message includes error details that can help with troubleshooting.
48 |
49 | Example output with `/health` endpoint (probes only care about the HTTP response status code):
50 |
51 | ```sh
52 | $ kubectl exec my-cluster-kafka-0 -- curl -s http://my-connect-cluster-connect-api:8083/health | jq
53 | {
54 | "status": "healthy",
55 | "message": "Worker has completed startup and is ready to handle requests."
56 | }
57 | ```
58 |
59 | ## Proposal
60 |
61 | Strimzi currently supports both Kafka 3.8 and 3.9 releases.
62 |
63 | As the `/health` endpoint was introduced in Kafka 3.9, we will wait for Kafka 3.8 to go out of support in Strimzi before doing the switch.
64 | This will happen in Strimzi 0.46 release where we will also have ZooKeeper removal and Kafka 4.0 support.
65 | A notable change will be added to the changelog to inform the users.
66 |
67 | The configuration options for liveness and readiness probes won't change.
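
For reference, the probes can still be tuned through the existing fields in the `KafkaConnect` and `KafkaMirrorMaker2` resources, for example:

```yaml
spec:
  # ...
  livenessProbe:
    initialDelaySeconds: 60
    timeoutSeconds: 5
  readinessProbe:
    initialDelaySeconds: 60
    timeoutSeconds: 5
```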
68 |
69 | ## Affected/not affected projects
70 |
71 | The only affected project is the Cluster Operator, in particular Kafka Connect and MirrorMaker 2 components.
72 |
73 | ## Compatibility
74 |
75 | This change is backwards compatible, and there should be no need to update Kafka Connect and MirrorMaker 2 probe configurations.
76 |
77 | The following test results show that there isn't a significant difference in performance between the `/health` and `/` endpoints.
78 | Note: pod ready time does not include the image pull time, and response time is computed as the 95th percentile over 200 requests with a 10 second period.
79 |
80 | | Endpoint | Pod ready time in seconds | Response time in ms |
81 | |----------|---------------------------|---------------------|
82 | | / | 65 | 3.0286 |
83 | | /health | 62 | 3.7525 |
84 |
85 | ## Rejected alternatives
86 |
87 | Switch to the `/health` endpoint while still supporting Kafka 3.8.
88 | This would allow the new feature to be adopted earlier, but with the cost of additional complexity.
89 |
--------------------------------------------------------------------------------
/090-support-dns-config.md:
--------------------------------------------------------------------------------
1 | # Support DNS configuration
2 |
3 | This proposal describes the motivation for adding optional nameserver configuration to Strimzi custom resources.
4 |
5 | ## Current situation
6 |
7 | Strimzi relies on the default DNS settings provided by Kubernetes or OpenShift pods.
8 | By default, these pods use a DNS policy called `ClusterFirst`, which prioritizes resolving names within the Kubernetes cluster.
9 | No additional DNS configuration is specified at the pod level.
10 | These defaults resolve domain names within the cluster first, before checking external sources.
11 | For domains that don't match the cluster suffix, queries are forwarded to the upstream DNS servers defined by the node or CoreDNS configuration.
12 | Strimzi resources thereby don't currently expose configuration that allows customization of name resolution on a resource level.
13 |
14 | ## Motivation
15 |
16 | Certain configurations of Kubernetes/OpenShift clusters require Strimzi resources deployed in different namespaces/projects to forward DNS queries for external resources to different nameservers than the one configured centrally in CoreDNS or on the node hosting the resource.
17 | For example, a dev and a stage environment/namespace/project hosted in the same Kubernetes/OpenShift cluster might need to target separate target clusters for similar subdomains, but cannot override the nameserver used, as this is controlled centrally for both namespaces/projects.
18 | Supporting the customization of nameserver configuration at the level of the Strimzi resource will allow the targeting of specific nameservers.
19 | This supports scenarios where resources in different namespaces/projects—while sharing the source cluster network—can still target matching target environments on separate, segregated networks.
20 |
21 | ## Proposal
22 |
23 | By adding `dnsPolicy` and `dnsConfig` properties to Strimzi's common `PodTemplate` in the Strimzi API, users can control domain name resolution for Strimzi resources.
24 |
25 | - `dnsPolicy` can specify resolution policies like `ClusterFirst`, `ClusterFirstWithHostNet`, `Default`, and `None`.
26 | - `dnsConfig` can allow users to define nameservers, search domains, and resolution options.
27 |
28 | This provides more control over the `/etc/resolv.conf` configuration for Strimzi resources, allowing domain name resolution to be customized at the resource level.
29 | Strimzi's CRDs generated from the PodTemplate will thereby allow user input for name resolution customization.
30 | The Cluster Operator will propagate these `dnsConfig` and `dnsPolicy` properties through `WorkloadUtils` to the `PodBuilder` and `PodTemplateSpecBuilder`, so that they are applied to the Pods controlled by the operator.
31 | Provisioned Pods will thereby use the name resolution configuration provided by the user when specified, and fall back to the defaults, as in the current situation, when no `dnsConfig` or `dnsPolicy` is specified.
32 |
33 | An example configuration, depending on the specific environment, could look as follows:
34 |
35 | ```yaml
36 | kind: KafkaMirrorMaker2
37 | spec:
38 | template:
39 | pod:
40 | dnsPolicy: "None"
41 | dnsConfig:
42 | nameservers:
43 | - 172.30.127.142
44 | options:
45 | - name: ndots
46 | value: "5"
47 | searches:
48 | - dev.svc.cluster.local
49 | - svc.cluster.local
50 | - cluster.local
51 | ```
52 | When the Pod above is created, the container gets the following contents in its `/etc/resolv.conf` file:
53 |
54 | ```
55 | search dev.svc.cluster.local svc.cluster.local cluster.local
56 | nameserver 172.30.127.142
57 | options ndots:5
58 | ```
59 |
60 | ## Affected/not affected projects
61 |
62 | Affected:
63 | - `api` module in the operator project
64 | - `cluster-operator` module in the operator project
65 |
66 | ## Compatibility
67 |
68 | If `dnsConfig` and `dnsPolicy` are not specified, these properties will not be propagated to the Pod resources, preserving the current behavior.
69 |
--------------------------------------------------------------------------------
/091-add-connect-to-test-container.md:
--------------------------------------------------------------------------------
1 | # Support running Kafka Connect in test-container
2 |
3 | The Strimzi [test-container](https://github.com/strimzi/test-container) library allows running Kafka clusters in containers. This is useful for integration and system tests that require a cluster.
4 |
5 | This proposal suggests adding support for running Kafka Connect clusters as well.
6 |
7 | ## Motivation
8 |
9 | With [test-container](https://github.com/strimzi/test-container) and [test-clients](https://github.com/strimzi/test-clients), it's possible to build test environments for all Apache Kafka components except Kafka Connect.
10 |
11 | Being able to easily start Kafka Connect clusters would be useful for testing client-side components, such as [metrics-reporter](https://github.com/strimzi/metrics-reporter) and [strimzi-kafka-oauth](https://github.com/strimzi/strimzi-kafka-oauth), with Kafka Connect.
12 |
13 | ## Proposal
14 |
15 | Create one new public class in the `io.strimzi.test.container` package of `test-container` called `StrimziConnectCluster` to represent a Kafka Connect cluster.
16 |
17 | ### StrimziConnectCluster API
18 |
19 | ```java
20 | /**
21 | * A Kafka Connect cluster using the latest image from quay.io/strimzi/kafka with the given version.
22 | * Kafka Connect is started in distributed mode. Users must use the exposed REST API to start, stop and manage connectors.
23 | */
24 | public class StrimziConnectCluster {
25 |
26 | /**
27 | * Get the workers of this Kafka Connect cluster.
28 | *
29 | * @return collection of GenericContainer containers
30 | */
31 | public Collection<GenericContainer<?>> getWorkers() { }
32 |
33 | /**
34 | * Start the Kafka Connect cluster.
35 | * This starts all the workers and waits for them to all be healthy and ready to be used.
36 | */
37 | public void start() { }
38 |
39 | /**
40 | * Stop the Kafka Connect cluster.
41 | */
42 | public void stop() { }
43 |
44 | /**
45 | * Return the REST API endpoint of one of the available workers.
46 | *
47 | * @return the REST API endpoint
48 | */
49 | public String getRestEndpoint() { }
50 |
51 | /**
52 | * Builder class for {@code StrimziConnectCluster}.
53 | *
54 | * Use this builder to create instances of {@code StrimziConnectCluster}.
55 | * You must at least call {@link #withKafkaCluster(StrimziKafkaCluster)}, and
56 | * {@link #withGroupId(String)} before calling {@link #build()}.
57 | *
58 | */
59 | public static class StrimziConnectClusterBuilder {
60 |
61 | /**
62 | * Set the Kafka cluster the Kafka Connect cluster will connect to.
63 | *
64 | * @param kafkaCluster the {@link StrimziKafkaCluster} instance
65 | * @return the current instance of {@code StrimziConnectClusterBuilder} for method chaining
66 | */
67 | public StrimziConnectClusterBuilder withKafkaCluster(StrimziKafkaCluster kafkaCluster) { }
68 |
69 | /**
70 | * Set the number of Kafka Connect workers in the cluster.
71 | * If not called, the cluster has a single worker.
72 | *
73 | * @param workersNum the number of Kafka Connect workers
74 | * @return the current instance of {@code StrimziConnectClusterBuilder} for method chaining
75 | */
76 | public StrimziConnectClusterBuilder withNumberOfWorkers(int workersNum) { }
77 |
78 | /**
79 | * Add additional Kafka Connect configuration parameters.
80 | * These configurations are applied to all workers in the cluster.
81 | *
82 | * @param additionalConnectConfiguration a map of additional Kafka Connect configuration options
83 | * @return the current instance of {@code StrimziConnectClusterBuilder} for method chaining
84 | */
85 | public StrimziConnectClusterBuilder withAdditionalConnectConfiguration(Map<String, String> additionalConnectConfiguration) { }
86 |
87 | /**
88 | * Specify the Kafka version to be used for the Connect workers in the cluster.
89 | * If not called, the latest Kafka version available from {@link KafkaVersionService} will be used.
90 | *
91 | * @param kafkaVersion the desired Kafka version for the Connect cluster
92 | * @return the current instance of {@code StrimziConnectClusterBuilder} for method chaining
93 | */
94 | public StrimziConnectClusterBuilder withKafkaVersion(String kafkaVersion) { }
95 |
96 | /**
97 | * Disable the FileStreams connectors.
98 | * If not called, the FileStreams connectors are added to plugin.path.
99 | *
100 | * @return the current instance of {@code StrimziConnectClusterBuilder} for method chaining
101 | */
102 | public StrimziConnectClusterBuilder withoutFileConnectors() { }
103 |
104 | /**
105 | * Specify the group.id of the Connect cluster.
106 | *
107 | * @param groupId the group id
108 | * @return the current instance of {@code StrimziConnectClusterBuilder} for method chaining
109 | */
110 | public StrimziConnectClusterBuilder withGroupId(String groupId) { }
111 |
112 | /**
113 | * Build and return a {@code StrimziConnectCluster} instance based on the provided configurations.
114 | *
115 | * @return a new instance of {@code StrimziConnectCluster}
116 | */
117 | public StrimziConnectCluster build() { }
118 | }
119 | }
120 | ```
121 |
122 | ### StrimziKafkaCluster
123 |
124 | At the moment, the bootstrap servers returned by `StrimziKafkaCluster.getBootstrapServers()` are meant to be used by applications running on the host, and they can't be used by other containers.
125 | To address this issue, this also proposes adding a new method to `StrimziKafkaCluster`:
126 |
127 | ```java
128 | /**
129 | * Get the bootstrap servers that containers on the same network should use to connect.
130 | *
131 | * @return a comma separated list of Kafka bootstrap servers
132 | */
133 | public String getNetworkBootstrapServers() { }
134 | ```
135 |
136 | This method will call `getNetworkBootstrapServers()` on each broker (`StrimziKafkaContainer`) in the cluster and concatenate the results (comma separated).
137 |
138 | ### StrimziKafkaContainer
139 |
140 | At the moment, the bootstrap servers returned by `StrimziKafkaContainer.getBootstrapServers()` are meant to be used by applications running on the host, and they can't be used by other containers.
141 | To address this issue, this also proposes adding a new method to `StrimziKafkaContainer`:
142 |
143 | ```java
144 | /**
145 | * Get the bootstrap servers that containers on the same network should use to connect.
146 | *
147 | * @return a comma separated list of Kafka bootstrap servers
148 | */
149 | public String getNetworkBootstrapServers() { }
150 | ```
151 |
152 |
153 | ## Affected projects
154 |
155 | This proposal affects the Strimzi [test-container](https://github.com/strimzi/test-container) project only. The plan is to use this new feature in [metrics-reporter](https://github.com/strimzi/metrics-reporter).
156 |
157 | ## Backwards compatibility
158 |
159 | There is no impact on backwards compatibility.
160 |
161 | ## Rejected alternatives
162 |
163 | There are currently no rejected alternatives.
164 |
--------------------------------------------------------------------------------
/094-deprecate-secrets-field-in-custom-server-authentication.md:
--------------------------------------------------------------------------------
1 | # Deprecate `secrets` field in `type: custom` authentication in `Kafka` CR
2 |
3 | This proposal suggests to deprecate and later remove the `secrets` field of the `type: custom` authentication in the Kafka custom resource.
4 |
5 | ## Motivation
6 |
7 | When using [`type: custom` authentication](https://strimzi.io/docs/operators/latest/full/configuring.html#type-KafkaListenerAuthenticationCustom-reference) (introduced in Strimzi 0.28) in Kafka brokers, users can mount any Secret resources that are used by their custom authentication mechanism.
8 | These Secrets are mounted into the `/opt/kafka/custom-authn-secrets/custom-listener-<listener name>-<port>/` directory and users can reference them by file path in their configuration.
9 | Strimzi does not do any validation or sanitization of their content and does not have any special handling for these Secrets.
10 | It just mounts them into the container.
11 |
12 | Recently, we introduced support for mounting custom volumes to any Strimzi container.
13 | This feature was introduced by [SP075 - Support for additional volumes](https://github.com/strimzi/proposals/blob/main/075-additional-volumes-support.md) in Strimzi 0.43.
14 | It allows the mounting of Secrets, but also Config Maps, PVCs or CSI volumes.
15 | Similarly to the Secrets from the `type: custom` authentication, Strimzi does not do any validation or sanitization of their content and does not have any special handling for these Secrets.
16 | They are just mounted into the `/mnt` path where they can be used by the user.
17 |
18 | The additional volumes feature can be used to replace the `secrets` field in the `type: custom` authentication (see the sketch after the following list).
19 | Having two different features to cover the same thing is unnecessary:
20 | * It creates more complex API and bigger CRD(s)
21 | * It causes more complex / duplicate code for adding the volumes to Pods
22 | * It causes more complex / duplicate code for adding the volume mounts to containers
23 |
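For illustration, a hedged sketch of mounting an authentication Secret through the additional volumes feature instead of the `secrets` field (the Secret name and volume name are assumptions):

```yaml
template:
  pod:
    volumes:
      - name: custom-auth-secret
        secret:
          secretName: my-auth-secret
  kafkaContainer:
    volumeMounts:
      - name: custom-auth-secret
        # Additional volumes are mounted under the /mnt path
        mountPath: /mnt/custom-auth-secret
```
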
24 | ## Proposal
25 |
26 | This proposal suggests to immediately:
27 | * Deprecate the `secrets` field in `type: custom` authentication object
28 | * Update the documentation to not use this deprecated field and use the additional volumes instead
29 | * Update the `CHANGELOG.md` file and documentation to inform users about this deprecation
30 | * Have warnings raised by the Cluster Operator when the deprecated field is used
31 |
32 | While deprecated, the `secrets` field will continue to work as before deprecation.
33 |
34 | Later, the `secrets` field will be removed in the Strimzi `v1` CRD API.
35 | And once the field is completely removed from the Strimzi API (i.e. after we drop support for `v1beta2` API), the code using the field will be removed as well.
36 |
37 | ### Why deprecate only this field?
38 |
39 | This proposal targets the deprecation and removal of the `secrets` field in the `type: custom` authentication because it is used to mount opaque Secrets without any validation or special handling and as such it provides the same functionality as the additional volumes.
40 | This is similar to how we already deprecated the external configuration volumes in Kafka Connect.
41 |
42 | This proposal does not aim to deprecate any fields used to mount specific Secrets such as:
43 | * Server certificates
44 | * Trusted certificates
45 | * Client certificates
46 | * OAuth client IDs or Secrets
47 | * Passwords
48 |
49 | In these places we usually do additional validations, handle reloading of these Secrets etc.
50 | This cannot be replaced by the additional volumes functionality.
51 |
52 | ## Affected projects
53 |
54 | This proposal affects the Strimzi Cluster Operator only.
55 |
56 | ## Backwards compatibility
57 |
58 | There is no impact on backwards compatibility.
59 |
60 | ## Rejected alternatives
61 |
62 | There are currently no rejected alternatives.
63 |
--------------------------------------------------------------------------------
/095-add-support-volumeattributesclassname.md:
--------------------------------------------------------------------------------
1 | # Add `volumeAttributesClassName` to the storage configuration
2 |
3 | Volumes in Strimzi are configured using the `PersistentClaimStorage` model, which is part of the Strimzi API's `Kafka` and `KafkaNodePool` resources, among others. This proposal covers adding the ability to set `volumeAttributesClassName` in the configuration for `PersistentClaimStorage`. A `VolumeAttributesClass` provides the ability to decouple storage parameters like IOPS, throughput, fstype, or other cloud-specific ones from the `StorageClass`.
4 |
5 | ## Current situation
6 |
7 | It is not possible to set the `volumeAttributesClassName` using [`PersistentClaimStorage`](https://github.com/strimzi/strimzi-kafka-operator/blob/c1b20f726dddbcd2a070c2eeb14fd30902027aec/api/src/main/java/io/strimzi/api/kafka/model/kafka/PersistentClaimStorage.java). To change any configuration parameters of the physical volume backing a `PersistentVolumeClaim`, users have to rely on the `StorageClass`'s `parameters` field. This couples the parameters and the class.
8 |
9 | ## Motivation
10 |
11 | Kubernetes v1.31 added a new method of configuring storage parameters for `PersistentVolumes` (PV) using [`VolumeAttributesClass`](https://kubernetes.io/docs/concepts/storage/volume-attributes-classes/) (VAC). These VACs are a cluster-wide resource. `PersistentVolumeClaims` (PVC) can then refer to them using `volumeAttributesClassName` along with their corresponding `storageClassName`. This decouples storage parameters specification from the `StorageClass` (SC) into the VAC. In a PVC, the `storageClassName` field is immutable, whereas the `volumeAttributesClassName` isn't. This makes it possible to dynamically reconfigure the PV without losing data.
12 |
13 | ## Proposal
14 |
15 | To accommodate this change, the `PersistentClaimStorage` (PCS) API needs an additional string field `volumeAttributesClass`. When this field changes in the PCS, the Cluster Operator (CO) can map it to the generated PVC's `volumeAttributesClassName` and let the CSI Driver take care of provisioning. There shouldn't be a need to validate the VAC since the parameter names depend on the cloud provider.
16 |
17 | The [`external-provisioner`](https://github.com/kubernetes-csi/external-provisioner) `csi-provisioner` usually checks whether the `driverName` in the VAC is the same as the `provisioner` in the SC. The operator could check that pre-emptively, but cannot depend on the CSI driver to use that implementation of a csi-provisioner. The user would be responsible for making sure the VACs and the SCs are configured correctly before configuring the PCS.
18 |
19 | An example configuration could be
20 |
21 | ```yaml
22 | apiVersion: kafka.strimzi.io/v1beta2
23 | kind: KafkaNodePool
24 | metadata:
25 | name: pool-a
26 | labels:
27 | strimzi.io/cluster: my-cluster
28 | spec:
29 | replicas: 3
30 | roles:
31 | - broker
32 | resources:
33 | requests:
34 | cpu: 1
35 | memory: 512Gi
36 | limits:
37 | cpu: 500m
38 | memory: 256Gi
39 | storage:
40 | type: persistent-claim
41 | size: 16Gi
42 | class: ebs
43 | volumeAttributesClass: ebs-fast
44 | deleteClaim: true
45 | ```
46 |
47 | which produces the following PVC
48 |
49 | ```yaml
50 | apiVersion: v1
51 | kind: PersistentVolumeClaim
52 | metadata:
53 | name:
54 | spec:
55 |   accessModes:
56 |     - ReadWriteOnce
57 |   volumeMode: Filesystem
58 |   resources:
59 |     requests:
60 | storage: 16Gi
61 | storageClassName: ebs
62 | volumeAttributesClassName: ebs-fast
63 | ```
64 |
65 | Here the user is expected to have a VAC named `ebs-fast` which has the same `driverName` as the `provisioner` of the SC named `ebs`. They are also expected to have the right parameters for the corresponding cloud provider's CSI driver.
66 |
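For illustration, a sketch of such a VAC for the AWS EBS CSI driver (the `parameters` shown are assumptions specific to that driver):

```yaml
apiVersion: storage.k8s.io/v1beta1
kind: VolumeAttributesClass
metadata:
  name: ebs-fast
# Must match the provisioner of the StorageClass referenced by the PVC
driverName: ebs.csi.aws.com
parameters:
  type: gp3
  iops: "8000"
  throughput: "500"
```
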
67 | When a user decides to update the `volumeAttributesClassName` of a PVC, the CSI driver will apply the changes. This update will ideally happen via the PCS, which in turn updates the PVC. A cloud provider error might be thrown as well; it will surface as an event on the corresponding PVC's status. In that case, the user is expected to fix any errors with the VAC and SC. For example, the AWS EBS CSI driver posts the following error when the VAC is changed right after provisioning, and the user is expected to fix it.
68 |
69 | ```
70 | Warning VolumeModifyFailed 26s (x5 over 66s) external-resizer ebs.csi.aws.com rpc error: code = Internal desc = Could not modify volume "vol-0fa7ff557865862c2": volume "vol-0fa7ff557865862c2" in OPTIMIZING state, cannot currently modify
71 | ```
72 |
73 | ## Affected/not affected projects
74 |
75 | The Cluster Operator is the only project affected.
76 |
77 | ## Compatibility
78 |
79 | No issues expected. The `volumeAttributesClassName` is optional in a PVC.
80 |
81 |
--------------------------------------------------------------------------------
/096-split-metrics-reporter-into-modules.md:
--------------------------------------------------------------------------------
1 | # Split metrics-reporter into client and server modules
2 |
3 | ## Current situation
4 |
5 | The metrics-reporter is currently a single artifact (JAR) that can be used with both Apache Kafka clients and servers.
6 |
7 | ## Motivation
8 |
9 | Having a single artifact is simple and works well for the basic functionality. We're now trying to add more advanced features, and it has proved hard to keep the same logic for both clients and servers.
10 |
11 | For example [support for dynamic configurations](https://github.com/strimzi/metrics-reporter/issues/55) is a feature only available for servers. Adding support for this feature is difficult without impacting the client logic. I opened a PoC [PR](https://github.com/strimzi/metrics-reporter/pull/64), and while I got it to work, this caused a lot of complexity due to `KafkaPrometheusMetricsReporter` needing to support both clients and servers.
12 |
13 | Another benefit of having separate modules is the ability to enforce dependencies. Depending on where the reporter runs, a Kafka client or a Kafka server, different dependencies are available at runtime. We need to make sure that logic used by clients does not load any classes only available on servers. We hit this issue in the past ([issue #48](https://github.com/strimzi/metrics-reporter/issues/48)). This is not something that can easily be tested so currently it relies on maintainers being diligent when making code changes to ensure we don't hit this issue again in the future.
14 |
15 | I'm also considering adding support for [KIP-714](https://cwiki.apache.org/confluence/display/KAFKA/KIP-714%3A+Client+metrics+and+observability) to metrics-reporter. Again this feature is only implemented by server-side metrics reporters. Adding support for this feature will be addressed in a separate proposal.
16 |
17 | ## Proposal
18 |
19 | I propose splitting the project into 2 Java modules:
20 |
21 | 1. `client-metrics-reporter`: This module will only depend on kafka-clients and the Prometheus libraries. It will be used by Apache Kafka clients (Producer, Admin, Consumer, Connect and Streams) by setting the `metric.reporters` configuration to `io.strimzi.kafka.metrics.prometheus.ClientMetricsReporter`. All the existing metrics-reporter configurations will stay the same. The differences are:
22 | - the reporter class name (it used to be `io.strimzi.kafka.metrics.KafkaPrometheusMetricsReporter`)
23 | - the dependency to add to the classpath
24 |
25 | This reporter will use its metric context to validate it runs in a client and will fail at startup if it's not the case.
26 |
27 | 2. `server-metrics-reporter`: This module will depend on the client-metrics-reporter module and also on Apache Kafka server JARs required to capture Yammer metrics (as described in [Proposal #64](https://github.com/strimzi/proposals/blob/main/064-prometheus-metrics-reporter.md)). It can be used by Apache Kafka servers (brokers and controllers) by setting the `metric.reporters` configuration to `io.strimzi.kafka.metrics.prometheus.ServerKafkaMetricsReporter` and the `kafka.metrics.reporters` configuration to `io.strimzi.kafka.metrics.prometheus.ServerYammerMetricsReporter`. All the existing metrics-reporter configurations will stay the same. The differences are:
28 | - the reporters class names (it used to be `io.strimzi.kafka.metrics.KafkaPrometheusMetricsReporter` and `io.strimzi.kafka.metrics.YammerPrometheusMetricsReporter`)
29 | - the dependencies to add to the classpath
30 |
31 | The project will publish 2 artifacts, one per module. The build will also produce archives including all the dependencies:
32 | - The client-metrics-reporter archive will contain the client-metrics-reporter JAR and all its dependencies
33 | - The server-metrics-reporter archive will contain the server-metrics-reporter JAR and all its dependencies (including the client-metrics-reporter JAR)
34 | ## Affected/not affected projects
35 |
36 | In addition to `metrics-reporter`, this will also impact:
37 |
38 | - `strimzi-kafka-operator`: There is a PR ongoing to add support for the metrics-reporter. This will need to be updated to use the new class names and dependency.
39 | - `strimzi-kafka-bridge`: The proposal to add support for the metrics-reporter is not impacted, but the implementation will need to be updated to use the new class names and dependency.
40 |
41 | ## Compatibility
42 |
43 | Upgrading from 0.1.0 to a newer release of the metrics-reporter will require manual changes. Users will have to download the right dependencies depending on whether they are using the reporter with clients or servers, and update their client or server configurations to use the new class names.
44 |
45 | Since the project is still in early access, now is the time to make breaking changes. It will be much harder to do so once the reporter is supported by other projects (`strimzi-kafka-operator`, `strimzi-kafka-bridge`) and starts being used by Strimzi users.
46 |
47 | ## Rejected alternatives
48 |
49 | - Keep a single module: It's _possible_ to implement [support for dynamic configurations](https://github.com/strimzi/metrics-reporter/issues/55) and [support for KIP-714](https://github.com/strimzi/metrics-reporter/issues/72) but this causes a lot of complexity.
--------------------------------------------------------------------------------
/097-deprecate-OPA-authorization.md:
--------------------------------------------------------------------------------
1 | # Deprecate and remove `type: opa` authorization in Kafka CR
2 |
3 | This proposal suggests to deprecate and later remove the Open Policy Agent (OPA) authorization (`type: opa`).
4 |
5 | ## Current situation
6 |
7 | Strimzi currently supports [Open Policy Agent authorization plugin](https://github.com/StyraInc/opa-kafka-plugin) for Kafka brokers.
8 | The plugin is bundled as part of our Apache Kafka container images.
9 | Users can configure it in the `Kafka` custom resource using the `type: opa` authorization.
10 | For example:
11 |
12 | ```yaml
13 | authorization:
14 | type: opa
15 | url: http://opa:8181/v1/data/kafka/crd/authz/allow
16 | expireAfterMs: 60000
17 | superUsers:
18 | - my-super-user
19 | ```
20 |
21 | ## Motivation
22 |
23 | Supporting the `type: opa` authorization and bundling the plugin in our images is not for free:
24 | * We need to maintain the code in the Cluster Operator
25 | * With every new Kafka release, we need to make sure the dependencies are aligned between Kafka, other plugins and the OPA Authorizer plugin
26 | * We need to maintain the system tests and make sure we use a reasonably up-to-date OPA version
27 |
28 | While some users appear to be using the `type: opa` authorization, it does not seem to be widely adopted.
29 | For the users using the OPA authorizer, there is also a possible workaround.
30 | They can continue using the OPA authorizer plugin even after we remove the dedicated support for it by following these steps:
31 | 1. Add the OPA authorizer plugin to the Kafka container image
32 | 2. Use the `type: custom` authorization to configure the OPA authorizer.
33 | The OPA authorizer class will be specified as part of the `authorization` section.
34 | The additional options can be specified in the `config` section.
35 | For example:
36 | ```yaml
37 | # ...
38 | kafka:
39 | # ...
40 | authorization:
41 | type: custom
42 | authorizerClass: org.openpolicyagent.kafka.OpaAuthorizer
43 | superUsers:
44 | - my-super-user
45 | config:
46 | opa.authorizer.url: http://opa:8181/v1/data/kafka/crd/authz/allow
47 | opa.authorizer.cache.expire.after.seconds: 60
48 | # ...
49 | ```
50 |
51 | Given the available workaround and the maintenance effort, it seems reasonable to deprecate and remove the direct support for OPA authorizer from Strimzi.
52 | It also helps to make the Strimzi project leaner and rely more on pluggability instead.
53 |
54 | ## Proposal
55 |
56 | Within Strimzi 0.46, this proposal suggests to immediately:
57 | * Deprecate the `type: opa` authorization
58 | * Update the documentation to not use this deprecated field and use the `type: custom` authorization instead
59 | * Update the `CHANGELOG.md` file and documentation to inform users about this deprecation
60 | * Have warnings raised by the Cluster Operator when the `type: opa` authorizer is used
61 |
62 | While deprecated, we will still continue bundling the OPA authorizer plugin as part of Strimzi.
63 |
64 | When the Strimzi `v1` CRD API is added, it will not support the `type: opa` anymore.
65 | But as the `type: opa` authorization will still be part of the `v1beta2` API, the support in the Cluster Operator and in the container images has to remain.
66 |
67 | Only in the first Strimzi version that drops the support for the `v1beta2` API and supports the `v1` API only will we:
68 | * Stop bundling the OPA authorizer plugin in the Strimzi container images
69 | * Remove the production code for configuring the OPA authorization
70 | * Remove the OPA system test
71 | * Update the documentation to remove the `type: opa` authorization content.
72 |
73 | From this version on, users will have to add the OPA authorizer plugin through a custom container image and configure it using the `type: custom` authorization.
74 |
75 | ## Affected projects
76 |
77 | This proposal affects the Strimzi Cluster Operator, System Tests, and the documentation.
78 |
79 | ## Backwards compatibility
80 |
81 | Users using the `type: opa` authorization will be impacted by these changes as they will need to start using custom container images and update their `Kafka` custom resources.
82 | Other users will not be impacted.
83 |
84 | ## Rejected alternatives
85 |
86 | As an alternative path, we could consider dropping the OPA support completely before the `v1` CRD API arrives.
87 | For example, we could drop the binaries and stop supporting the `type: opa` authorization in an earlier Strimzi version such as Strimzi 0.48.
88 | However, I decided to start the proposal with the OPA authorization removal tied to the removal of the `v1beta2` API version.
89 |
--------------------------------------------------------------------------------
/099-drop-travis-ci-and-testing-for-ppc-and-s390x.md:
--------------------------------------------------------------------------------
1 | # Drop Travis-CI and testing efforts for `ppc64le` and `s390x` architectures
2 |
3 | This proposal suggests dropping the current usage of [Travis CI](https://www.travis-ci.com/) in the [strimzi-kafka-bridge](https://github.com/strimzi/strimzi-kafka-bridge) and [strimzi-kafka-oauth](https://github.com/strimzi/strimzi-kafka-oauth) repositories.
4 |
5 | ## Motivation
6 |
7 | In the past, we adopted Azure Pipelines as the primary CI system for the Operators repository, using it to run tests, build documentation, and create artifacts and images.
8 | The main reason for this transition was the improved resource quotas available for CNCF projects, which allowed us to run pipelines with far fewer restrictions than we had in Travis CI.
9 | However, Azure Pipelines does not provide agents for the `ppc64le` and `s390x` architectures.
10 | This was not an issue for Operators, as we build multi-architecture images on `amd64` using _docker buildx_.
11 | However, for Strimzi Kafka Bridge and Strimzi Kafka OAuth, we decided to retain the testing pipelines for `ppc64le` and `s390x`.
12 |
13 | In recent months, we have encountered several unexpected issues with Travis CI that required us to submit support tickets to restore our quota.
14 | These issues are often not immediately noticed when they occur but are only discovered weeks later.
15 | It seems that no one even realizes that some pipelines fail to execute from time to time.
16 | Another scenario is that jobs fail due to various Travis CI issues, yet we simply ignore the results and merge the PRs anyway.
17 |
18 | Such cases create an unexpected maintenance burden with little added value:
19 | - we must keep Travis CI configurations up to date, even though it is not our primary CI system and is only used for unit tests on the `ppc64le` and `s390x` architectures
20 | - we occasionally need to contact Travis CI support to resolve issues
21 | - no one actively reports bugs in `ppc64le` and `s390x` that could have been detected by pipelines running on Travis CI
22 | - we generally do not pay attention to unstable pipelines on Travis CI and merge PRs regardless
23 |
24 | _Note that Strimzi's Travis CI quota is sponsored by IBM._
25 |
26 | ## Proposal
27 |
28 | This proposal suggests removing the existing Travis CI configurations from the Strimzi Kafka Bridge and Strimzi Kafka OAuth repositories, effectively eliminating Travis CI usage from the Strimzi organization.
29 | From a CI perspective, we will be able to focus solely on Azure Pipelines, which has been the main CI system for Strimzi for the past few years.
30 | Regarding the current support matrix, we will not make any changes, meaning that all currently supported architectures — `amd64`, `arm64`, `ppc64le`, and `s390x` — will continue to be supported by the Strimzi project.
31 |
32 | For Strimzi Kafka Bridge, the affected pipeline includes a Java build with unit tests on the `ppc64le` and `s390x` architectures using OpenJDK 17.
33 | Its removal will not impact anything crucial, as artifacts are already built and tested as part of Azure Pipelines.
34 |
35 | The situation with Strimzi Kafka OAuth is slightly more complex because Azure Pipelines does not currently execute the integration test suite for the library.
36 | Before removing Travis CI from this repository, the integration test execution must first be migrated to Azure Pipelines.
37 | However, SpotBugs checks, artifact builds, and pushes are already handled by Azure Pipelines, meaning the only affected part is testing.
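
As a minimal sketch of what that migration might look like, the following Azure Pipelines job outline is illustrative only; the Maven invocations and the `testsuite` module layout are assumptions, not the repository's actual configuration:

```yaml
# Hypothetical Azure Pipelines job for the strimzi-kafka-oauth integration tests
jobs:
  - job: integration_tests
    displayName: 'Integration tests (amd64)'
    pool:
      vmImage: 'ubuntu-22.04'
    steps:
      # Build the library first so the test suite runs against fresh artifacts
      - script: mvn -B install -DskipTests
        displayName: 'Build artifacts'
      # Run the integration test suite (module path assumed)
      - script: mvn -B verify -f testsuite
        displayName: 'Run integration tests'
```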
38 |
39 | The main part of Strimzi testing — system tests from the Operators repository — has never been executed on `ppc64le` or `s390x`.
40 | As a result, this proposal does not introduce any changes in that regard, and the overall quality of the Operators will remain unaffected.
41 |
42 | We will adopt the following terminology:
43 | - the `amd64` architecture will have the status `supported`, which means that all available tests will be run on this architecture on a regular basis, as we do now
44 | - the `ppc64le`, `s390x`, and `arm64` architectures will have the status `supported, not tested`, meaning that tests will not be regularly executed on these architectures
45 | As a result, we will add an architecture support matrix table to the project's `README.md` using the terminology described above (an example is shown below).
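
An illustrative version of that table:

| Architecture | Status                |
|--------------|-----------------------|
| `amd64`      | supported             |
| `arm64`      | supported, not tested |
| `ppc64le`    | supported, not tested |
| `s390x`      | supported, not tested |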
46 |
47 | All reported bugs related to the `ppc64le`, `s390x`, and `arm64` architectures will be appropriately triaged during Community calls and resolved based on the triage outcome, without any changes to the current process.
48 |
49 | If accepted, we have to ensure that all testing (unit, integration, and e2e tests running on `amd64` agents) is migrated from Travis CI to Azure Pipelines for every PR and every commit to the main branch in all affected repositories.
50 |
51 | ## Affected projects
52 |
53 | This proposal affects the Strimzi Kafka Bridge and Strimzi Kafka OAuth repositories only.
54 |
55 | ## Backwards compatibility
56 |
57 | There is no impact on backwards compatibility.
58 |
59 | ## Rejected alternatives
60 |
61 | Migrate the Travis CI workloads to [Testing Farm](https://docs.testing-farm.io/Testing%20Farm/0.1/index.html), a Red Hat sponsored CI system.
62 | This could allow us to continue running the tests on `ppc64le` and `s390x`, and to add the `arm64` architecture; however, it would not remove the maintenance burden of another CI system.
63 | Overall, this alternative could end up in the same situations as we are facing with Travis CI now.
64 |
--------------------------------------------------------------------------------
/101-redesign-restart-events.md:
--------------------------------------------------------------------------------
1 | # Redesign Restart Events
2 |
3 | Update the Kubernetes events that are emitted by the Strimzi cluster operator when rolling Pods to list the Kafka, Connect or MM2 resource as the `involvedObject`.
4 |
5 | ## Current situation
6 |
7 | Currently, when the Kafka Pods are rolled, we issue Kubernetes Events describing the reason for the restart.
8 | This is only done for the Kafka, Connect and MM2 Pod restarts.
9 | The events are issued with the Pods as the `involvedObject`, for example:
10 |
11 | ```yaml
12 | action: StrimziInitiatedPodRestart
13 | # ...
14 | involvedObject:
15 | kind: Pod
16 | name: my-cluster-dual-role-0
17 | namespace: kafka
18 | kind: Event
19 | # ...
20 | message: Pod was manually annotated to be rolled
21 | # ...
22 | reason: ManualRollingUpdate
23 | reportingComponent: strimzi.io/cluster-operator
24 | reportingInstance: strimzi-cluster-operator-8d7bb7477-2dmxj
25 | # ...
26 | ```
27 |
28 | Users can filter for events emitted by the Strimzi cluster operator using:
29 | ```shell
30 | kubectl get events -n kafka --field-selector reportingComponent=strimzi.io/cluster-operator
31 | ```
32 |
33 | ## Motivation
34 |
35 | Pods have many events when they are restarted, so although the restart reason event is issued by the Strimzi cluster operator, it is easily lost among the other kubelet events.
36 |
37 | ## Proposal
38 |
39 | The restart events emitted by the Strimzi cluster operator will be updated to have the Kafka, Connect or MM2 resource as the `involvedObject`, rather than the Pod being restarted.
40 | An example event would then look like this:
41 |
42 | ```yaml
43 | action: StrimziInitiatedPodRestart
44 | # ...
45 | involvedObject: # (1)
46 | kind: Kafka
47 | name: my-cluster
48 | namespace: kafka
49 | kind: Event
50 | # ...
51 | message: Pod my-cluster-dual-role-0 was manually annotated to be rolled # (2)
52 | # ...
53 | reason: ManualRollingUpdate
54 | related: # (3)
55 | kind: Pod
56 | name: my-cluster-dual-role-0
57 | namespace: kafka
58 | reportingComponent: strimzi.io/cluster-operator
59 | reportingInstance: strimzi-cluster-operator-55d66bf7bd-htjtd
60 | # ...
61 | ```
62 |
63 | 1. The `regarding` field in the [Event API](https://kubernetes.io/docs/reference/kubernetes-api/cluster-resources/event-v1/#Event) is changed to the Kafka (or Connect or MM2) resource.
64 | The `regarding` field maps to `involvedObject` in the output.
65 | 2. The restart message will be updated to include the Pod name to make it easier to identify the affected Pod when listing events.
66 | 3. The `related` field is added pointing to the Pod that is being rolled.
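
With this change, users could filter the restart events for a whole cluster rather than per Pod; a sketch using the standard Event field selectors:

```shell
kubectl get events -n kafka \
  --field-selector involvedObject.kind=Kafka,involvedObject.name=my-cluster,reportingComponent=strimzi.io/cluster-operator
```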
67 |
68 | ## Affected/not affected projects
69 |
70 | This only affects the Strimzi cluster operator.
71 |
72 | ## Compatibility
73 |
74 | This will be a change for users; however, since the events being emitted aren't versioned, there is no clear way to signal this change.
75 | The 0.46 release already includes several major changes, like the ZooKeeper and MirrorMaker 1 removal.
76 | As a result, it is reasonable to assume that for this release users will be closely reviewing the changelog, so we should aim to include this change in that release.
77 | If it misses the deadline we can wait for the v1 API as discussed in `Rejected alternatives`.
78 |
79 | ## Rejected alternatives
80 |
81 | ### Removing the restart events
82 |
83 | We could fully remove the restart events.
84 | Some users appear to be using them, but as far as we know it is a relatively small number.
85 |
86 | If we removed the events entirely, users would have to check the logs to see why a Pod was restarted.
87 | Since the logs are fairly busy, this can be hard to find unless a dedicated log collection tool is used.
88 | Even though the events do add some overhead, it isn't a great deal, and they are a useful way to check why a Pod is restarting without having to trawl through logs.
89 |
90 | ### Waiting until the v1 API
91 |
92 | A previous version of this proposal suggested releasing this change at the same time as the v1 API.
93 | When the v1 API lands, users are likely to review the changelog more thoroughly than for other releases, increasing the likelihood that they will not be caught out by the change.
94 | The same is true for the 0.46 release, since that release has other major changes like the ZooKeeper and MirrorMaker 1 removal.
95 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Strimzi Community Code of Conduct
2 |
3 | Strimzi Community Code of Conduct is defined in the [governance repository](https://github.com/strimzi/governance/blob/master/CODE_OF_CONDUCT.md).
--------------------------------------------------------------------------------
/GOVERNANCE.md:
--------------------------------------------------------------------------------
1 | # Strimzi Governance
2 |
3 | Strimzi Governance is defined in the [governance repository](https://github.com/strimzi/governance/blob/master/GOVERNANCE.md).
--------------------------------------------------------------------------------
/MAINTAINERS.md:
--------------------------------------------------------------------------------
1 | # Strimzi Maintainers list
2 |
3 | Strimzi Maintainers list is defined in the [governance repository](https://github.com/strimzi/governance/blob/master/MAINTAINERS).
--------------------------------------------------------------------------------
/images/009-scram-sha512-admin-server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/009-scram-sha512-admin-server.png
--------------------------------------------------------------------------------
/images/009-scram-sha512-ui-request-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/009-scram-sha512-ui-request-flow.png
--------------------------------------------------------------------------------
/images/011-deployment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/011-deployment.png
--------------------------------------------------------------------------------
/images/011-session-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/011-session-architecture.png
--------------------------------------------------------------------------------
/images/011-topicsdesign.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/011-topicsdesign.png
--------------------------------------------------------------------------------
/images/011-topology.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/011-topology.png
--------------------------------------------------------------------------------
/images/017-kafkaenc-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/017-kafkaenc-overview.png
--------------------------------------------------------------------------------
/images/031-strimzi-with-statefulset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/031-strimzi-with-statefulset.png
--------------------------------------------------------------------------------
/images/031-strimzi-with-strimzipodset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/031-strimzi-with-strimzipodset.png
--------------------------------------------------------------------------------
/images/031-strimzipodset-controller.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/031-strimzipodset-controller.png
--------------------------------------------------------------------------------
/images/047-quota-plugin-interactions.excalidraw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/047-quota-plugin-interactions.excalidraw
--------------------------------------------------------------------------------
/images/047-quota-plugin-interactions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/047-quota-plugin-interactions.png
--------------------------------------------------------------------------------
/images/048-kafka-roller-current-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/048-kafka-roller-current-flow.png
--------------------------------------------------------------------------------
/images/048-kafka-roller-new-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/048-kafka-roller-new-flow.png
--------------------------------------------------------------------------------
/images/051-states.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/051-states.png
--------------------------------------------------------------------------------
/images/059-rejected-zk-kraft-migration-fsm-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/059-rejected-zk-kraft-migration-fsm-1.png
--------------------------------------------------------------------------------
/images/059-rejected-zk-kraft-migration-fsm-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/059-rejected-zk-kraft-migration-fsm-2.png
--------------------------------------------------------------------------------
/images/059-zk-kraft-migration-fsm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/059-zk-kraft-migration-fsm.png
--------------------------------------------------------------------------------
/images/064-current.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/064-current.png
--------------------------------------------------------------------------------
/images/064-proposal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/064-proposal.png
--------------------------------------------------------------------------------
/images/100-cert-renewals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/100-cert-renewals.png
--------------------------------------------------------------------------------
/images/100-existing-renew-replace-clientca-certs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/100-existing-renew-replace-clientca-certs.png
--------------------------------------------------------------------------------
/images/100-new-cluster-ca-key-replacement.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/100-new-cluster-ca-key-replacement.png
--------------------------------------------------------------------------------
/images/100-new-ee-certs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/100-new-ee-certs.png
--------------------------------------------------------------------------------
/images/100-new-renew-replace-clientsca-cert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/images/100-new-renew-replace-clientsca-cert.png
--------------------------------------------------------------------------------
/logo/cncf-color.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/logo/cncf-color.png
--------------------------------------------------------------------------------
/logo/strimzi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strimzi/proposals/0d88b6111b75abb68a34482e6fd88df84055a695/logo/strimzi.png
--------------------------------------------------------------------------------